This module builds on code contained in Coronavirus_Statistics_USAF_v004.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.
The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.
The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
source("./Coronavirus_USAF_Functions_v001.R")
Further, the mapping file specific to USA Facts is sourced:
source("./Coronavirus_USAF_Default_Mappings_v001.R")
A few functions should be added back to Generic_Added_…v001.R and Coronavirus_CDC_Daily…_v001.R after they have been more thoroughly checked for compatibility with the state-based clustering. For now, they are included below so as to overwrite the functions obtained from source(…):
# Updated function for handling county-level clusters
# Builds four dashboard-style plots from cluster-date aggregated data.
# Depends on helperSummaryMap(), helperAggTotal(), and helperAggTrend(),
# which are defined or sourced elsewhere in this file.
createSummary <- function(df,
                          stateClusterDF=NULL,
                          brewPalette=NA,
                          dataType="state"
                          ) {
  # FUNCTION ARGUMENTS:
  # df: an integrated data frame by cluster-date
  # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # dataType: the type of maps being produced ("state" or "county")
  # Create plots that can be relevant for a dashboard, including:
  # 1. Map of segments
  # 2. Bar plot of counts by segment
  # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
  # 4. Facetted trend-line plot of burden by segments
  # Create a map of the clusters; cluster assignments come from stateClusterDF
  # when supplied, otherwise they are expected as columns of df
  p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         # labelScale only when no brewer palette is supplied
                         # (otherwise the scale arrives through extraArgs)
                         labelScale=is.na(brewPalette),
                         # Small east-coast states get text labels (hard to see on the map)
                         textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create a bar plot of counts by segment (countOnly=TRUE skips the map)
  p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         labelScale=is.na(brewPalette),
                         countOnly=TRUE,
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create plot for population and burden by cluster
  # divideBy rescales pop to millions and wm_tcpm7 to per-thousand for display
  p3 <- df %>%
    helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
                   mapper=c("pop"="Population (millions)",
                            "wm_tcpm7"="Cases per thousand",
                            "wm_tdpm7"="Deaths per million"
                            ),
                   xLab=NULL,
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
                   extraArgs=if(is.na(brewPalette)) list() else
                     list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                   )
  # Create plot for cumulative burden per million over time
  # Extra ggplot layers (date axis formatting, rotated labels) passed as a list
  p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
                 arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                 )
  if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
  # Hospitalization metric (wm_hpm7) is only available for state-level data
  p4 <- df %>%
    helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
                   mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
                            "wm_tdpm7"="Deaths per million\n(cumulative)",
                            "wm_hpm7"="Hospitalized per million\n(current)"
                            ),
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("wm_tcpm7"=1000),
                   linesize=0.75,
                   extraArgs=p4xtra
                   )
  # Return the four plot objects
  list(p1=p1, p2=p2, p3=p3, p4=p4)
}
# Helper function to make a summary map
# Produces either a usmap choropleth of 'values' by geography, or (when
# countOnly=TRUE) a horizontal bar chart of member counts per value.
helperSummaryMap <- function(df,
                             mapLevel="states",
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             countOnly=FALSE,
                             textLabel=c(),
                             ...
                             ) {
  # FUNCTION ARGUMENTS:
  # df: a data frame containing a level of geography and an associated cluster
  # mapLevel: a parameter for whether the map is "states" or "counties"
  # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
  # values: the character name of the field containing the data to be plotted
  # discreteValues: boolean for whether the values are discrete (if not, use continuous)
  # NULL means infer from data
  # legend.position: character for the location of the legend in the plot
  # labelScale: boolean, should an scale_fill_ be created? Use FALSE if contained in extraArgs
  # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
  # countOnly: should a bar plot of counts only be produced?
  # textLabel: a list of elements that should be labelled as text on the plot (too small to see)
  # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
  # Modify the data frame to contain only the relevant data
  df <- df %>%
    select(all_of(c(keyCol, values))) %>%
    distinct()
  # Determine the type of data being plotted (non-numeric implies discrete)
  if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
  # Convert data type if needed so numeric cluster ids plot as discrete fills
  if (isTRUE(discreteValues) & is.numeric(df[[values]]))
    df[[values]] <- factor(df[[values]])
  # If count only is needed, create a count map; otherwise create a map
  if (isTRUE(countOnly)) {
    # NOTE(review): aes_string() and ..count.. are the pre-ggplot2-3.4 idioms;
    # the session here runs ggplot2 3.3.x where they are required -- revisit
    # (aes(.data[[values]]) / after_stat(count)) if ggplot2 is upgraded
    gg <- df %>%
      ggplot(aes(x=fct_rev(get(values)))) +
      geom_bar(aes_string(fill=values)) +
      # Count labels centered at half the bar height
      stat_count(aes(label=..count.., y=..count../2),
                 geom="text",
                 position="identity",
                 fontface="bold"
                 ) +
      coord_flip() +
      labs(y="Number of members", x="")
  } else {
    # usmap::plot_usmap requires the county key to be named 'fips'
    if(keyCol=="countyFIPS") {
      df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
      keyCol <- "fips"
    }
    gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
    # Add stacked text labels off the east coast for the requested elements
    if (length(textLabel) > 0) {
      labDF <- df %>%
        filter(get(keyCol) %in% textLabel) %>%
        # Preserve the caller's ordering of textLabel in the label stack
        mutate(rk=match(get(keyCol), textLabel)) %>%
        arrange(rk) %>%
        # Hard-coded lon/lat anchor near the US east coast, stepping each
        # successive label slightly west and south
        mutate(lon=-70.1-seq(0, 0.8*length(textLabel)-0.8, by=0.8),
               lat=40.1-seq(0, 1.5*length(textLabel)-1.5, by=1.5)
               ) %>%
        select(lon, lat, everything()) %>%
        usmap::usmap_transform()
      # NOTE(review): lon.1/lat.1 are the transformed-coordinate column names
      # produced by this usmap version's usmap_transform() -- verify on upgrade
      gg <- gg + geom_text(data=labDF,
                           aes(x=lon.1, y=lat.1, label=paste(get(keyCol), get(values))),
                           size=3.25
                           )
    }
  }
  # Position the legend as requested
  gg <- gg + theme(legend.position=legend.position)
  # Create the scale if appropriate (skip when the caller supplies one via extraArgs)
  if (isTRUE(labelScale)) gg <- gg +
    if(isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
  # Apply extra arguments (each element added as a ggplot layer/scale/theme)
  for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
  # Return the map object
  gg
}
# Function to pivot the data file longer
# Thin dispatcher over tidyr's pivot_longer()/pivot_wider() with a single
# consistent argument vocabulary for both directions.
pivotData <- function(df,
                      pivotKeys,
                      nameVar="name",
                      valVar="value",
                      toLonger=TRUE,
                      ...
                      ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame
  # pivotKeys: the keys (everything but cols for pivot_longer, id_cols for pivot_wider)
  # nameVar: variable name for names_to or names_from
  # valVar: variable name for values_to or values_from
  # toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
  # ...: other arguments to be passed to pivot_*()
  # Guard clause: anything other than TRUE pivots wider
  if (!isTRUE(toLonger)) {
    return(pivot_wider(df,
                       all_of(pivotKeys),
                       names_from=all_of(nameVar),
                       values_from=all_of(valVar),
                       ...
                       ))
  }
  # Default path: pivot every non-key column into nameVar/valVar pairs
  pivot_longer(df, -all_of(pivotKeys), names_to=nameVar, values_to=valVar, ...)
}
An existing processed USA Facts list is loaded for use as the comparison set:
# Load the previously processed USA Facts list (saved via RDS) as the comparison set
cty_newformat_20201026 <- readFromRDS("cty_newformat_20201026")
A function is added to create a county-level cluster map with state borders:
# Create a county-level cluster map with state borders overlaid; counties with
# an NA cluster (e.g., those remapped from '999') are left as naFill.
sparseCountyClusterMap <- function(vec,
                                   clustRemap=c("999"=NA),
                                   caption=NULL,
                                   brewPalette=NULL,
                                   naFill="white"
                                   ) {
  # FUNCTION ARGUMENTS:
  # vec: a named vector where the names are countyFIPS and the values are the clusters
  # can also be a data.frame, in which case clustersToFrame() and clustRemap are not used
  # clustRemap: remapping vector for clusters
  # caption: caption to be included (NULL means no caption)
  # brewPalette: name of a palette for scale_fill_brewer()
  # NULL means use scale_fill_discrete() instead
  # naFill: fill to use for NA counties
  # Convert to a tibble for use with usmap if not already a data.frame
  if ("data.frame" %in% class(vec)) {
    df <- vec
  } else {
    # Build a fips/cluster frame; clusters in names(clustRemap) are remapped
    # (default sends the '999' small-population cluster to NA, i.e., blank)
    df <- clustersToFrame(vec, colNameName="fips", convFactor=FALSE) %>%
      mutate(cluster=as.character(cluster),
             cluster=ifelse(cluster %in% names(clustRemap), clustRemap[cluster], cluster)
             )
  }
  # Create a blank state map with black lines
  blankStates <- usmap::plot_usmap("states")
  # Create a county cluster map with NA values excluded
  dataCounties <- df %>%
    filter(!is.na(cluster)) %>%
    usmap::plot_usmap(regions="counties", data=., values="cluster")
  # Integrate as a ggplot object
  # NOTE(review): [[1]] pulls the transformed polygon data out of the ggplot
  # object returned by plot_usmap -- this indexes usmap internals and may
  # break on a usmap upgrade; confirm when bumping the package
  p1 <- ggplot() +
    geom_polygon(data=dataCounties[[1]],
                 aes(x=x, y=y, group=group, fill=dataCounties[[1]]$cluster),
                 color = NA,
                 size = 0.1
                 ) +
    # State borders drawn on top with a near-transparent fill so county
    # colors remain visible underneath
    geom_polygon(data=blankStates[[1]],
                 aes(x=x, y=y, group=group),
                 color = "black",
                 lwd=1.25,
                 fill = alpha(0.001)
                 ) +
    coord_equal() +
    ggthemes::theme_map()
  # Add the appropriate fill (can use viridis_d also)
  if (is.null(brewPalette)) p1 <- p1 + scale_fill_discrete("Cluster", na.value=naFill)
  else if (brewPalette=="viridis") p1 <- p1 + scale_fill_viridis_d("Cluster", na.value=naFill)
  else p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette, na.value=naFill)
  # Add caption if requested
  if (!is.null(caption)) p1 <- p1 + labs(caption=caption)
  # Return the plotting object
  p1
}
The function is then run using previously created segments:
# Map the previously created (20201026) county segments with the Paired palette
sparseCountyClusterMap(cty_newformat_20201026$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 25k are blank"
                       )
Next steps are to load new data and compare against previous, while using existing segments:
# File locations of the downloaded USA Facts case and death CSVs
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
# Raw reference data from the prior (20201026) run, used for comparison
compareList <- list("usafCase"=cty_newformat_20201026$dfRaw$usafCase,
                    "usafDeath"=cty_newformat_20201026$dfRaw$usafDeath
                    )
# Read and process the new data while reusing the existing 20201026 clusters
# downloadTo is NA for files that already exist locally (skips re-download)
cty_newdata_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        downloadTo=lapply(readList,
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList,
                                        compareFile=compareList,
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log",
                                        ovrwriteLog=TRUE,
                                        useClusters=cty_newformat_20201026$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 551 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 29 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 640 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.20e+9 3.28e+7 1602886
## 2 after 6.18e+9 3.27e+7 1577284
## 3 pctchg 4.58e-3 3.84e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.26e+8 590404 1602886
## 2 after 1.25e+8 589015 1577284
## 3 pctchg 3.75e-3 0.00235 0.0160
## NULL
# Map the existing segments applied to the refreshed (20210608) data
sparseCountyClusterMap(cty_newdata_20210608$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 25k are blank"
                       )
There has been significant convergence among segments in average deaths per million and cases per million. This is suggestive of several possibilities, such as that growth in burden may be inversely proportional to previous burden. Next steps are to create segments using the most recent data, seeking to identify differences in 1) cumulative burden, primarily defined by deaths, and 2) shape of the curve in getting to cumulative burden.
New segments are created, with assessments:
# File locations of the downloaded USA Facts case and death CSVs
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
# Create new clusters from the already-processed per-capita data
# (useClusters=NULL triggers clustering rather than reuse)
cty_newsegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewSegs_20210608_v005.log",
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=cty_newdata_20210608$dfPerCapita,
                                        useClusters=NULL,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired",
                                        # Counties under minPopCluster fall into the default '999' cluster
                                        defaultCluster="999",
                                        minPopCluster=40000,
                                        hierarchical=NA,
                                        # Window of months used for the shape-of-curve component
                                        minShape="2020-04",
                                        maxShape="2021-05",
                                        # Weighting of deaths vs cases and total vs shape in the distance
                                        ratioDeathvsCase = 5,
                                        ratioTotalvsShape = 0.5,
                                        minDeath=100,
                                        minCase=5000,
                                        # 3x3 grid: high/medium/low burden by early/spread/late shape
                                        hmlSegs=3,
                                        eslSegs=3,
                                        seed=2106091243
                                        )
##
## *** File has been checked for uniqueness by: countyFIPS date
## NULL
# Map the newly created (20210608) segments; the 40k threshold matches minPopCluster
sparseCountyClusterMap(cty_newsegs_20210608$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 40k are blank"
                       )
The function createSummary() is updated to 1) create the sparse summary map; and 2) exclude the small county segment from select plots:
# Function to create diagnoses and plots for clustering data
# Integrates the raw list into detail/aggregate frames, builds summary and
# detailed plots, optionally prints them, and returns everything as a list.
diagnoseClusters <- function(lst,
                             lstExtract=fullListExtract,
                             clusterFrame=NULL,
                             brewPalette=NA,
                             clusterType="state",
                             printSummary=TRUE,
                             printDetailed=TRUE
                             ) {
  # FUNCTION ARGUMENTS:
  # lst: a list containing processed clustering data
  # lstExtract: the elements to extract from lst with an optional function for converting the elements
  # NULL means use the extracted element as-is
  # clusterFrame: tibble of the clusters to be plotted
  # NULL means create from lst
  # brewPalette: the color palette to use with scale_*_brewer()
  # default NA means use the standard color/fill profile
  # clusterType: character variable of form "state" for state clusters and "county" for county
  # printSummary: boolean, should summary plots be printed to the log?
  # printDetailed: boolean, should detailed plots be printed to the log?
  # Get the key variable (used for joins and the like)
  if (clusterType=="state") keyVar <- "state"
  else if (clusterType=="county") keyVar <- "countyFIPS"
  else stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
  # Create clusterFrame from lst if it has been passed as NULL
  if (is.null(clusterFrame)) clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
  # Create the integrated and aggregate data from lst
  dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
  dfAgg <- combineAggData(dfFull, aggBy=plotCombineAggByMapper[[clusterType]])
  # Create the main summary plots
  summaryPlots <- createSummary(dfAgg,
                                stateClusterDF=clusterFrame,
                                brewPalette=brewPalette,
                                dataType=clusterType
                                )
  # Create the detailed summaries
  detPlots <- createDetailedSummaries(dfDetail=dfFull,
                                      dfAgg=dfAgg,
                                      detVar=keyVar,
                                      p2DetMetrics=plotCombineAggByMapper[[clusterType]]$agg2$aggVars,
                                      brewPalette=brewPalette
                                      )
  # Print the summary plots if requested (use the sparse county plotting)
  # NOTE(review): uses summaryPlots$p5, so this requires the createSummary()
  # version that returns the sparse county map as a fifth element
  if (isTRUE(printSummary)) {
    # 2x2-ish layout: map and descriptors on top row, trends spanning the bottom
    gridExtra::grid.arrange(summaryPlots$p5 + theme(legend.position="none"),
                            summaryPlots$p3 + theme(legend.position="left"),
                            summaryPlots$p4,
                            layout_matrix=rbind(c(1, 2),
                                                c(3, 3)
                                                )
                            )
  }
  # Print the detailed plots if requested
  if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
  # Return a list of the key plotting files
  list(dfFull=dfFull,
       dfAgg=dfAgg,
       plotClusters=clusterFrame,
       summaryPlots=summaryPlots,
       detPlots=detPlots
       )
}
# Function to create detailed cluster summaries
# Builds three plots from detail- and aggregate-level burden data:
# p1: cumulative burden bars by sub-cluster (state-level only)
# p2: facetted burden trends per cluster (state lines, or 10%-90% ribbon)
# p3: current cumulative burden vs growth over the trailing p3Days
createDetailedSummaries <- function(dfDetail,
                                    dfAgg,
                                    aggVar=c("cluster"),
                                    detVar=c("state"),
                                    p1Metrics=c("tcpm", "tdpm"),
                                    p1Order=c("tdpm"),
                                    p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
                                    p2AggMetrics=paste0("wm_", p2DetMetrics),
                                    p3Metrics=p1Metrics,
                                    p3Days=30,
                                    p3Slope=0.25,
                                    mapper=c("tcpm"="Cases per million\n(cumulative)",
                                             "tdpm"="Deaths per million\n(cumulative)",
                                             "cpm7"="Cases\nper million",
                                             "dpm7"="Deaths\nper million",
                                             "hpm7"="Hospitalized\nper million",
                                             "tdpm7"="Deaths (cum)\nper million",
                                             "tcpm7"="Cases (cum)\nper million"
                                             ),
                                    brewPalette=NA
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfDetail: data frame or tibble containing detailed (sub-cluster) data
  # dfAgg: data frame or tibble containing aggregated (cluster) data
  # aggVar: variable reflecting the aggregate level
  # detVar: variable reflecting the detailed level
  # p1Metrics: metrics to be shown for plot 1 (will be faceted)
  # p1Order: variable for ordering detVar in p1
  # p2DetMetrics: variables to be included from dfDetail for p2
  # p2AggMetrics: corresponding variables to be included from dfAgg for p2
  # p3Metrics: metrics to be included in the growth plots
  # p3Days: number of days to include for calculating growth
  # p3Slope: the slope for the dashed line for growth in p3
  # mapper: mapping vector from variable name to descriptive (display) name
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # Create plot for aggregates by sub-cluster (state-level data only)
  if(detVar=="state") {
    p1 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, aggVar, "date", p1Metrics)) %>%
      pivot_longer(all_of(p1Metrics)) %>%
      filter(!is.na(value)) %>%
      # Keep each state's most recent observation
      group_by_at(detVar) %>%
      filter(date==max(date)) %>%
      ungroup() %>%
      # Order the bars by the p1Order metric's value
      # NOTE(review): .fun reads 'name' from the plot data mask; confirm this
      # still resolves correctly on ggplot2/forcats upgrades
      ggplot(aes(x=fct_reorder(get(detVar),
                               value,
                               .fun=function(x) { max(ifelse(name==p1Order, x, 0)) }
                               ),
                 y=value
                 )
             ) +
      geom_col(aes(fill=get(aggVar))) +
      facet_wrap(~mapper[name], scales="free_x") +
      coord_flip() +
      labs(title="Cumulative burden", x=NULL, y=NULL)
    if (!is.na(brewPalette)) p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette)
  } else {
    # Do not create the plot for other than state-level data
    p1 <- NULL
  }
  # Create facetted burden trends by aggregate
  # For state-level, create each state as a line
  # For anything else, create a 10%-90% range
  if (detVar=="state") {
    p2 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
      pivot_longer(all_of(p2DetMetrics)) %>%
      filter(!is.na(value)) %>%
      ggplot(aes(x=date, y=value)) +
      geom_line(aes_string(group=detVar), color="grey", size=0.5) +
      facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
      scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
      labs(x=NULL, y=NULL, title="Burden by cluster and metric")
  } else {
    p2 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
      pivot_longer(all_of(p2DetMetrics)) %>%
      filter(!is.na(value)) %>%
      # Collapse detail members to a 10th-90th percentile band per cluster-date
      group_by_at(c(aggVar, "name", "date")) %>%
      summarize(p10=unname(quantile(value, 0.1)), p90=unname(quantile(value, 0.9)), .groups="drop") %>%
      ggplot(aes(x=date)) +
      geom_ribbon(aes(ymin=p10, ymax=p90), alpha=0.75) +
      facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
      scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
      labs(x=NULL,
           y=NULL,
           title="Burden by cluster and metric",
           subtitle="Shaded region is 10%le through 90%le, solid line is weighted mean"
           )
  }
  # Overlay the cluster-level (weighted mean) trend lines on p2; aggregate
  # columns are renamed to the detail metric names so the facets align
  aggPlot <- dfAgg %>%
    colSelector(vecSelect=c("date", aggVar, p2AggMetrics)) %>%
    colRenamer(vecRename=purrr::set_names(p2DetMetrics, p2AggMetrics)) %>%
    pivot_longer(all_of(p2DetMetrics)) %>%
    filter(!is.na(value))
  p2 <- p2 +
    geom_line(data=aggPlot, aes_string(color=aggVar, group=aggVar, y="value"), size=1.5)
  if (!is.na(brewPalette)) p2 <- p2 +
    scale_color_brewer("Cluster", palette=brewPalette) +
    theme(legend.position="none")
  # Create growth trends plot (current cumulative vs growth over p3Days)
  # Previously wrapped in 'if (TRUE)' with an unreachable 'p3 <- NULL' branch;
  # the dead wrapper has been removed with no behavior change
  # For county-level data, exclude the small-population default cluster '999'
  if(detVar=="countyFIPS") p3 <- dfDetail %>% filter(cluster != "999") else p3 <- dfDetail
  p3 <- p3 %>%
    colSelector(vecSelect=c(detVar, aggVar, "date", p3Metrics)) %>%
    pivot_longer(all_of(p3Metrics)) %>%
    filter(!is.na(value)) %>%
    group_by_at(c(detVar, "name")) %>%
    # Keep only the endpoints of the trailing p3Days window
    filter(date %in% c(max(date), max(date)-lubridate::days(p3Days))) %>%
    mutate(growth=max(value)-min(value)) %>% # not ideal way to calculate
    filter(date==max(date)) %>%
    ungroup() %>%
    ggplot(aes(x=value, y=growth))
  # States get labelled points (text); other detail levels plain points
  if(detVar=="state") p3 <- p3 + geom_text(aes_string(color=aggVar, label=detVar), fontface="bold")
  else p3 <- p3 + geom_point(aes_string(color=aggVar))
  p3 <- p3 +
    facet_wrap(~mapper[name], scales="free") +
    labs(title="Current vs growth",
         subtitle=paste0("Dashed line represents ",
                         round(100*p3Slope),
                         "% growth rate over past ",
                         p3Days,
                         " days"
                         ),
         x="Most recent cumulative",
         y=paste0("Growth over past ", p3Days, " days")
         ) +
    lims(y=c(0, NA), x=c(0, NA)) +
    theme(panel.background = element_rect(fill = "white", colour = "white"),
          panel.grid.major = element_line(size = 0.25, linetype = 'solid', color = "grey")
          ) +
    # Reference line for the p3Slope growth rate
    geom_abline(slope=p3Slope, intercept=0, lty=2)
  if (!is.na(brewPalette)) {
    p3 <- p3 + scale_color_brewer(stringr::str_to_title(aggVar), palette=brewPalette)
  }
  if(detVar=="countyFIPS") p3 <- p3 + labs(caption="Counties below population threshold excluded")
  # Return a list of plot objects
  list(p1=p1, p2=p2, p3=p3)
}
# Updated function for handling county-level clusters
# Same as the earlier createSummary() but additionally returns p5, a sparse
# county cluster map, when dataType is "county" (NULL otherwise).
createSummary <- function(df,
                          stateClusterDF=NULL,
                          brewPalette=NA,
                          dataType="state"
                          ) {
  # FUNCTION ARGUMENTS:
  # df: an integrated data frame by cluster-date
  # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # dataType: the type of maps being produced ("state" or "county")
  # Create plots that can be relevant for a dashboard, including:
  # 1. Map of segments
  # 2. Bar plot of counts by segment
  # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
  # 4. Facetted trend-line plot of burden by segments
  # 5. Sparse county cluster map (if dataType is "county")
  # Create a map of the clusters
  p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         # Default scale only when no brewer palette supplied
                         labelScale=is.na(brewPalette),
                         # Small east-coast states labelled as text (hard to see)
                         textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create a bar plot of counts by segment (countOnly=TRUE skips the map)
  p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         labelScale=is.na(brewPalette),
                         countOnly=TRUE,
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create plot for population and burden by cluster
  # divideBy rescales pop to millions and wm_tcpm7 to per-thousand for display
  p3 <- df %>%
    helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
                   mapper=c("pop"="Population (millions)",
                            "wm_tcpm7"="Cases per thousand",
                            "wm_tdpm7"="Deaths per million"
                            ),
                   xLab=NULL,
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
                   extraArgs=if(is.na(brewPalette)) list() else
                     list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                   )
  # Create plot for cumulative burden per million over time
  p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
                 arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                 )
  if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
  # Hospitalization metric (wm_hpm7) is only available for state-level data
  p4 <- df %>%
    helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
                   mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
                            "wm_tdpm7"="Deaths per million\n(cumulative)",
                            "wm_hpm7"="Hospitalized per million\n(current)"
                            ),
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("wm_tcpm7"=1000),
                   linesize=0.75,
                   extraArgs=p4xtra
                   )
  # Create the sparse county plot (if it is county data) - assumes default that '999' is 'too small'
  if (dataType=="county") {
    vecFrame <- if(is.null(stateClusterDF)) df else stateClusterDF
    vecFrame <- vecFrame %>% select(countyFIPS, cluster) %>%
      distinct()
    # Named vector: values are clusters, names are county FIPS codes
    vecUse <- vecFrame$cluster %>% purrr::set_names(vecFrame$countyFIPS)
    # sparseCountyClusterMap expects NULL (not NA) for the default palette
    p5 <- sparseCountyClusterMap(vecUse,
                                 caption="Counties below population threshold left blank",
                                 brewPalette=if(is.na(brewPalette)) NULL else brewPalette
                                 )
  } else {
    p5 <- NULL
  }
  # Return the five plot objects (p5 is NULL for state-level data)
  list(p1=p1, p2=p2, p3=p3, p4=p4, p5=p5)
}
The updated functions are tested:
# Confirm that new cluster maps are working as intended
# Re-run with the just-created per-capita data and clusters (no new read/compare)
cty_chksegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        dfPerCapita=cty_newsegs_20210608$dfPerCapita,
                                        useClusters=cty_newsegs_20210608$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## NULL
The updated maps blank out the ‘999’ (small population) cluster as desired. The clusters are designed using a 3x3 of total burden (mainly death) and shape of the curve. Shapes of the curve are plotted, normalized so that each curve sums to 100%:
# Aggregate burden curves by cluster, normalized so each cluster's curve sums
# to 100% across dates (the '999' small-population cluster is excluded)
p1Data <- cty_chksegs_20210608$plotDataList$dfAgg %>%
  pivotData(pivotKeys=c("cluster", "date", "pop")) %>%
  filter(!is.na(value), cluster!="999", name %in% c("wm_cpm7", "wm_dpm7")) %>%
  group_by(cluster, name) %>%
  mutate(pct=value/sum(value)) %>%
  ungroup()
# Plot all together
# FIX: labs() takes 'title', not 'main' ('main' is base graphics and was
# silently ignored by ggplot2, so the intended titles never rendered)
p1Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(aes(group=cluster, color=cluster), lwd=1) +
  facet_wrap(~c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]) +
  scale_color_brewer(palette="Paired") +
  labs(x=NULL,
       y="Percentage of total burden",
       title="Burden over time, normalized to 100% cumulative burden"
       )
# Plot as facets (one panel per metric-cluster combination)
p1Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(aes(group=cluster, color=cluster), lwd=1) +
  facet_grid(c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]~cluster) +
  scale_color_brewer(palette="Paired") +
  labs(x=NULL,
       y="Percentage of total burden",
       title="Burden over time, normalized to 100% cumulative burden"
       )
Broadly, the shapes selected by the clustering include:
Broadly, there are examples of relatively higher and lower burden within each of these shapes, though the highest burden clusters mainly had two spikes with primary impact early and the medium burden clusters rarely had the primary impact early.
Next, all counties of at least 50,000 population are examined for the shape of the cumulative deaths curve:
# Build normalized cumulative-burden curves for all counties with >= 50k population
p2Data <- cty_chksegs_20210608$plotDataList$dfFull %>%
  select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
  pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
  filter(!is.na(value), pop>=50000) %>%
  group_by(countyFIPS, name) %>%
  # Normalize each county's cumulative curve to its final (max) value
  mutate(pct=value/max(value)) %>%
  ungroup()
# Distance matrix between counties on the shape of cumulative deaths (tdpm7):
# pivot wide to one row per county with dates as columns, then euclidean dist()
p2Dist <- p2Data %>%
  filter(name=="tdpm7") %>%
  select(countyFIPS, date, pct) %>%
  pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="pct") %>%
  column_to_rownames("countyFIPS") %>%
  as.matrix() %>%
  dist()
# Hierarchical clustering with complete and single linkage; dendrograms plotted
p2Complete <- hclust(p2Dist, method="complete")
plot(p2Complete)
p2Single <- hclust(p2Dist, method="single")
plot(p2Single)
There is something peculiar about data in ‘32510’. A plot is created:
# Highlight the anomalous county (FIPS 32510) against the mean normalized curve
# FIX: title said "Cluster 32510", but 32510 is a county FIPS code (the filter
# below is on countyFIPS), so the label now says "County"
p2Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(data=~filter(., countyFIPS=="32510"), aes(group=countyFIPS), color="red", lwd=2) +
  # Mean of the normalized curves across all counties, by metric and date
  geom_line(data=~summarize(group_by(., name, date), pct=mean(pct), .groups="drop")) +
  facet_wrap(~c("tcpm7"="Rolling 7 cases", "tdpm7"="Rolling 7 deaths")[name]) +
  labs(x=NULL, y="Percentage of total burden on day", title="County 32510 is red, mean is black")
There is clearly a data issue with county FIPS 32510. Clusters are examined at a variable number of cut points:
plotHierarchical <- function(obj, k, df) {
  # FUNCTION ARGUMENTS:
  # obj: clustering object from hierarchical clustering (hclust)
  # k: number of clusters to create
  # df: data frame with burden data (county-date-metric rows with a pct column)
  # Counts by cluster (printed as a side effect; the value itself is not reused)
  ctCluster <- cutree(obj, k=k) %>%
    table() %>%
    print()
  # Make clustering data frame: join cluster assignments onto the burden data,
  # then summarize cumulative and incremental percentages by cluster-date
  clData <- cutree(obj, k=k) %>%
    clustersToFrame(colNameName="countyFIPS") %>%
    joinFrames(df) %>%
    group_by(countyFIPS, name) %>%
    # Day-over-day increments from the cumulative pct
    # NOTE(review): lag() assumes rows are ordered by date within each
    # countyFIPS-name group -- TODO confirm upstream ordering guarantees this
    mutate(delta=ifelse(row_number()==1, pct, pct-lag(pct))) %>%
    ungroup() %>%
    group_by(cluster, date, name) %>%
    summarize(p05cum=quantile(pct, 0.05),
              mucum=mean(pct),
              p95cum=quantile(pct, 0.95),
              p05delta=quantile(delta, 0.05),
              mudelta=mean(delta),
              p95delta=quantile(delta, 0.95),
              .groups="drop"
              )
  # Plot by cluster and metric (cumulative percentages)
  p1 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mucum), lwd=1) +
    geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
    facet_grid(c("tcpm7"="% cumulative cases", "tdpm7"="% cumulative deaths")[name]~cluster) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="% of cumulative", title="Mean and 90% range by cluster and metric")
  print(p1)
  # Plot incremental rather than cumulative
  p2 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mudelta), lwd=1) +
    geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
    facet_grid(c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name]~cluster) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="% of total", title="Mean and 90% range by cluster and metric")
  print(p2)
  # Return the clustering frame
  clData
}
# Examine hierarchical clusterings at k = 2 through 5 cut points (plots print as a side effect)
plotList <- lapply(2:5, FUN=function(x) plotHierarchical(p2Complete, k=x, df=p2Data))
## .
## 1 2
## 841 149
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 773 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 498 275 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 498 275 68 114 35
## Joining, by = "countyFIPS"
At a glance, with 5 clusters, there is a clear early cluster, a clear early/late cluster, and a group of three seemingly similar clusters (primarily late). Further exploration of these three clusters may be merited.
The five clusters are examined on a single plot:
# Overlay the per-cluster burden curves on a single grid, with rows split by
# whether a cluster is in the "late" group and columns by metric.
# df: cluster-date summary frame with mucum/mudelta and p05/p95 columns
# vecLate: cluster labels to treat as late-burden clusters
# facetScales: 'scales' argument forwarded to facet_grid()
# showRibbon: add the 5th-95th percentile shaded band?
# showCum: plot cumulative (mucum) values rather than incremental (mudelta)?
tempPlotter <- function(df, vecLate, facetScales="free_y", showRibbon=TRUE, showCum=FALSE) {
  gg <- df %>%
    mutate(lateCluster=(cluster %in% vecLate)) %>%
    ggplot(aes(x=date)) +
    geom_line(aes(group=cluster, color=cluster, y=if(showCum) mucum else mudelta), lwd=1) +
    facet_grid(lateCluster~c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name],
               scales=facetScales
               ) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y=NULL, title="Disease burden by time period and shape cluster")
  # Optionally layer on the 90% band (5th to 95th percentile) per cluster
  if (isTRUE(showRibbon)) {
    ribbonLayer <- geom_ribbon(aes(ymin=if(showCum) p05cum else p05delta,
                                   ymax=if(showCum) p95cum else p95delta,
                                   fill=cluster,
                                   group=cluster
                                   ), alpha=0.25)
    gg <- gg +
      ribbonLayer +
      scale_fill_brewer(palette="Set1") +
      labs(subtitle="Shaded region covers 90% of observations")
  }
  gg
}
# Compare clusters from the k=5 cut (plotList[[4]]), flagging clusters 4 and 5 as "late"
# Incremental means, no ribbon
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE)
# Cumulative means, no ribbon
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE, showCum=TRUE)
# Incremental means with 90% ribbon
tempPlotter(plotList[[4]], vecLate=c(4, 5))
# Cumulative means with 90% ribbon
tempPlotter(plotList[[4]], vecLate=c(4, 5), showCum=TRUE)
Visually, there are meaningful distinctions in the clusters when overlaid, particularly with the shape of the cumulative curve:
The geographic locations of the five hierarchical cluster types are plotted:
tempMapper <- function(obj,
                       k,
                       regions="counties",
                       varName=if(regions=="counties") "fips" else "state",
                       values="cluster",
                       lvlOrder=NULL,
                       title=if(is.null(lvlOrder)) NULL else "Lightest colors have earliest burden",
                       caption=NULL
                       ) {
  # Plot the geographic distribution of a k-cluster cut of a hierarchical tree.
  # FUNCTION ARGUMENTS:
  # obj: hierarchical clustering object (passed to cutree)
  # k: number of clusters to cut the tree into
  # regions: "counties" or state-level; only drives the default of varName
  # varName: column name for the geographic identifier in the cluster frame
  # values: name of the cluster column (re-leveled when lvlOrder is supplied)
  # lvlOrder: optional explicit factor-level ordering for the cluster column
  # title/caption: plot annotations
  clusterDF <- clustersToFrame(cutree(obj, k=k), colNameName=varName)
  if (!is.null(lvlOrder)) {
    clusterDF[[values]] <- factor(clusterDF[[values]], levels=lvlOrder)
  }
  sparseCountyClusterMap(clusterDF, brewPalette="viridis", caption=caption) +
    labs(title=title)
}
# lvlOrder re-levels clusters so earlier-burden clusters get the lightest map colors
tmpText <- "Plots only counties with at least 50k population"
tempMapper(p2Complete, k=2, lvlOrder=c(1, 2), caption=tmpText)
tempMapper(p2Complete, k=3, lvlOrder=c(2, 1, 3), caption=tmpText)
tempMapper(p2Complete, k=4, lvlOrder=c(3, 2, 1, 4), caption=tmpText)
tempMapper(p2Complete, k=5, lvlOrder=c(3, 2, 1, 4, 5), caption=tmpText)
Maps are created to show when counties first cleared specific hurdles for cumulative deaths per million:
# Map when each county first crossed the hurdle tgtBPM on metric tgtVar,
# bucketing the crossing date via fn (default: calendar quarter)
tempBurdenMapper <- function(df,
popMin=1,
popMax=Inf,
tgtVar="tdpm",
tgtBPM=1000,
fn=lubridate::quarter,
title=paste0("Time when county first hit ", tgtBPM, " on metric ", tgtVar),
caption=paste("Plots counties with population between", popMin, "and", popMax),
...
) {
# FUNCTION ARGUMENTS:
# df: county-date frame containing countyFIPS, pop, date, and the tgtVar column
# popMin/popMax: population bounds (popMin inclusive, popMax exclusive)
# tgtVar: string name of the metric column compared against the hurdle
# tgtBPM: hurdle value the metric must reach
# fn: function applied to the first crossing date to form the cluster label
# ...: forwarded to fn (e.g. with_year=TRUE for lubridate::quarter)
p1 <- df %>%
filter(pop < popMax, pop >= popMin) %>%
group_by(countyFIPS) %>%
# bpmMax is computed but not used downstream; retained for inspection
summarize(bpmMax=max(get(tgtVar), na.rm=TRUE),
# First date at/above the hurdle; ifelse() drops the Date class, so it is
# rebuilt via as.Date with the standard epoch origin.
# NOTE(review): specNA(min) appears to be an NA-tolerant min wrapper from the
# sourced utility functions -- confirm its semantics there
keyDate=as.Date(specNA(min)(ifelse(get(tgtVar) >= tgtBPM, date, NA)), origin="1970-01-01"),
.groups="drop"
) %>%
mutate(cluster=fn(keyDate, ...),
# Counties that never crossed the hurdle get an explicit "not yet" bucket
cluster=ifelse(is.na(cluster), "not yet", cluster),
fips=countyFIPS
) %>%
sparseCountyClusterMap(brewPalette = "viridis", caption=caption)
p1 + labs(title=title)
}
# Looking at tdpm of 500, 1000, 5000
# with_year=TRUE is forwarded to lubridate::quarter so labels distinguish years
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=1000, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=5000, with_year=TRUE)
# Looking specifically at 2500 tdpm, overall and split by county size at 25k
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMin=25000, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMax=25000, tgtBPM=2500, with_year=TRUE)
Next steps are to create curves using larger counties, then assess how well they fit the shape of the curves in the smaller counties.
A function is created to calculate the distances from each of the cluster means:
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum") {
  # Squared euclidean distance from each county's curve to each cluster mean curve.
  # FUNCTION ARGUMENTS:
  # dfClust: cluster-date summary frame (output of plotHierarchical)
  # dfAll: county-date frame containing a "pct" column
  # metType: metric (value of the "name" column) to compare on
  # metClust: summary column of dfClust that holds the cluster mean curve
  clustMeans <- dfClust %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("cluster", "date", metClust)) %>%
    colRenamer(vecRename=purrr::set_names("pct", metClust))
  countyCurves <- dfAll %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("countyFIPS", "date", "pct"))
  # Cross every cluster with every county on date; missing values count as 0
  clustMeans %>%
    left_join(countyCurves, by="date") %>%
    mutate(delta2=(ifelse(is.na(pct.x), 0, pct.x) - ifelse(is.na(pct.y), 0, pct.y))^2) %>%
    group_by(cluster, countyFIPS) %>%
    summarize(dist=sum(delta2), .groups="drop") %>%
    arrange(countyFIPS, dist)
}
# Create data for all counties
# pct normalizes each county-metric series by its own maximum (share of final burden)
p2All <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>0) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Calculate distances to cluster mean, then print
countyClusterDist <- tempDistCluster(dfClust=plotList[[4]], dfAll=p2All)
countyClusterDist
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <ord> <chr> <dbl>
## 1 1 01001 1.20
## 2 2 01001 3.10
## 3 3 01001 5.48
## 4 4 01001 14.8
## 5 5 01001 48.8
## 6 2 01003 0.938
## 7 1 01003 1.88
## 8 3 01003 7.53
## 9 4 01003 19.3
## 10 5 01003 58.0
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF <- cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster") %>%
left_join(countyClusterDist, by="countyFIPS") %>%
arrange(countyFIPS, dist)
# For each county keep only the closest cluster mean, then cross-tab
# closest cluster vs assigned hierarchical cluster
tmpDistDF %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=as.integer(as.character(cluster))) %>%
count(hierCluster, cluster) %>%
group_by(hierCluster) %>%
# Row percentages within each assigned cluster
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
Most counties are closest to the mean of the hierarchical cluster they are assigned to. There are some differences, as expected since method=“complete” was chosen for the hierarchical clusters. Assignments are then made for all counties, and then plotted:
# Full clustering database: assign every county to its closest cluster mean;
# hierCluster is NA for counties that were not in the hierarchical data
distOnlyClust <- countyClusterDist %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster"),
by="countyFIPS"
)
# Cluster assignments depending on size of county
distOnlyClust %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels ordered so lighter viridis colors correspond to earlier burden)
sparseCountyClusterMap(rename(distOnlyClust, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(3, 2, 1, 4, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher percentage of burden early")
There are clear geographic patterns to the data, with counties having earlier disease generally being surrounded by other counties with (relatively) earlier disease.
The latest burden is also included:
# Attach latest cumulative deaths per million (tdpm7 at the max date) to the
# closest-cluster assignments
p2Merge <- p2All %>%
filter(date==max(date), name=="tdpm7") %>%
select(countyFIPS, pop, state, date, tdpm7=value) %>%
inner_join(distOnlyClust, by=c("countyFIPS"))
# Density of final death burden by cluster, split by county size at 50k
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
ggplot(aes(x=tdpm7)) +
geom_density(aes(color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million", y="Relative density", title="Deaths vs shape segments")
Smaller counties are more likely to be very low or very high on deaths per million, as expected given the smaller denominators. Among the larger counties, segment 5 appears to generally have higher deaths, a trend that does not appear to hold in smaller counties.
ECDF curves are created as well:
# ECDF of deaths per million, one panel per cluster, colored by county size
# (prop is a manual ECDF: rank / group size after sorting by tdpm7)
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=countySize, color=countySize), size=1) +
facet_wrap(~cluster) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes all counties (even with 0 deaths)"
)
# Same ECDF restricted to counties with 1+ deaths, with panels/colors swapped:
# one panel per county-size band, colored by cluster
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
filter(tdpm7>0) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=cluster, color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes only counties with 1+ deaths"
)
Larger counties appear to generally have had lower burdens, with the exception of very low burden, where there is some crossing of the curves at around 500 deaths per million.
Suppose that counties are clustered based on cumulative deaths (not percentage) by month, with focus on larger counties first:
# Include all counties with population greater than 0
p2DataAll <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>=1) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Create distance matrix and hierarchical clusters
# (uses raw value, not pct, so absolute burden matters; counties 100k+ only)
p2DistValue100k <- p2DataAll %>%
filter(name=="tdpm7", pop>=100000) %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2CompleteValue100k <- hclust(p2DistValue100k, method="complete")
plot(p2CompleteValue100k)
There appears to be a branch of counties that are unique:
# Identify the smaller of the two k=2 clusters (the atypical branch) and plot
# each member county's deaths-per-million curve
cutree(p2CompleteValue100k, k=2) %>%
  vecToTibble(colNameName="countyFIPS", colNameData="cluster") %>%
  group_by(cluster) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  filter(n==min(n)) %>%  # keep only the smallest cluster
  left_join(p2DataAll, by="countyFIPS") %>%
  # Join keys made explicit (previously auto-detected, emitting a
  # "Joining, by = c(countyFIPS, state)" message)
  left_join(select(getCountyData(), countyFIPS, countyName, state),
            by=c("countyFIPS", "state")
  ) %>%
  filter(name=="tdpm7") %>%
  # Facet label: "FIPS - Name (ST)" with the redundant " County" suffix removed
  mutate(countyName=paste0(countyFIPS, " - ",
                           stringr::str_replace(countyName, " County", ""),
                           " (",
                           state,
                           ")"
  )
  ) %>%
  ggplot(aes(x=date, y=value)) +
  geom_line() +
  facet_wrap(~countyName) +
  labs(x=NULL, y="Deaths per million", title="Counties in the first, small segment")
## Joining, by = c("countyFIPS", "state")
These counties have high total burdens combined with shapes differing from the norm. This leads them to become their own clusters, requiring a greater number of clusters to get at the main groupings:
# Updated to allow for variable other than pct
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
# FUNCTION ARGUMENTS:
# obj: clustering object from hierarchical clustering
# k: number of clusters to create
# df: data frame with burden data
# keyVar: the key variable to use in df
# facetScale: passed to facet_grid(scales=) for both plots
# Counts by cluster (printed as a side effect)
ctCluster <- cutree(obj, k=k) %>%
table() %>%
print()
# Make clustering data frame
clData <- cutree(obj, k=k) %>%
clustersToFrame(colNameName="countyFIPS") %>%
joinFrames(df) %>%
group_by(countyFIPS, name) %>%
# Incremental change per county-metric; the first row keeps its full level
mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
ungroup() %>%
group_by(cluster, date, name) %>%
summarize(p05cum=quantile(get(keyVar), 0.05),
mucum=mean(get(keyVar)),
p95cum=quantile(get(keyVar), 0.95),
p05delta=quantile(delta, 0.05),
mudelta=mean(delta),
p95delta=quantile(delta, 0.95),
.groups="drop"
)
# Plot by cluster and metric (cumulative view)
p1 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mucum), lwd=1) +
geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
scales=facetScale
) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
print(p1)
# Plot incremental rather than cumulative
p2 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mudelta), lwd=1) +
geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
print(p2)
# Return the clustering frame
clData
}
# Cluster summaries for k = 2..9 on the value-based (not pct) clustering of 100k+ counties
plotListValue100k <- lapply(2:9,
FUN=function(x) plotHierarchical(p2CompleteValue100k,
k=x,
df=filter(p2DataAll, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 586 13
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 319 267 13
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 319 267 4 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 319 236 4 31 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6
## 319 236 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7
## 319 81 155 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8
## 319 81 155 4 23 8 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8 9
## 319 81 144 4 23 11 8 7 2
## Joining, by = "countyFIPS"
With 9 segments, the following shapes and burdens (using deaths as the metric) are noted:
Separating counties with very high burden and very low burden appears reasonable. Clustering can be re-run with only the counties showing moderate disease burden, where shape is a more meaningful factor.
Segments are selected using k=8, with the four smallest buckets consolidated:
# Cut at k=8, then consolidate any cluster with fewer than 20 counties into
# a catch-all cluster "9"
dfClust100k_k8 <- cutree(p2CompleteValue100k, k=8) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
group_by(tempCluster) %>%
mutate(cluster=factor(ifelse(n()>=20, tempCluster, 9))) %>%
ungroup()
dfClust100k_k8
## # A tibble: 599 x 3
## countyFIPS tempCluster cluster
## <chr> <int> <fct>
## 1 01003 1 1
## 2 01015 2 2
## 3 01055 2 2
## 4 01069 2 2
## 5 01073 2 2
## 6 01081 1 1
## 7 01089 1 1
## 8 01097 3 3
## 9 01101 2 2
## 10 01103 2 2
## # ... with 589 more rows
# Counts per consolidated cluster
count(dfClust100k_k8, cluster)
## # A tibble: 5 x 2
## cluster n
## <fct> <int>
## 1 1 319
## 2 2 81
## 3 3 155
## 4 9 21
## 5 5 23
The plotHierarchical function is updated to allow for passing clusters to it directly:
# Updated to allow for variable other than pct and obj that is of class data.frame
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
  # FUNCTION ARGUMENTS:
  # obj: clustering object from hierarchical clustering OR df with countyFIPS-cluster
  # k: number of clusters to create (ignored when obj is already a data frame)
  # df: data frame with burden data
  # keyVar: the key variable to use in df
  # facetScale: passed to facet_grid(scales=) for both plots
  # Convert obj to a data frame if obj has been passed as an hclust object
  # (inherits() is the idiomatic class test; robust to multi-class objects)
  if (!inherits(obj, "data.frame")) {
    if (!inherits(obj, "hclust")) {
      stop("\nMust pass object that is member of class data.frame or hclust\n")
    }
    # Print counts by cluster
    cutree(obj, k=k) %>%
      table() %>%
      print()
    # Convert obj to relevant data frame
    obj <- cutree(obj, k=k) %>%
      clustersToFrame(colNameName="countyFIPS")
  }
  # Make clustering data frame (obj is a countyFIPS-cluster data.frame at this point)
  clData <- obj %>%
    joinFrames(df) %>%
    group_by(countyFIPS, name) %>%
    # Incremental change per county-metric; the first row keeps its full level
    mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
    ungroup() %>%
    group_by(cluster, date, name) %>%
    summarize(p05cum=quantile(get(keyVar), 0.05),
              mucum=mean(get(keyVar)),
              p95cum=quantile(get(keyVar), 0.95),
              p05delta=quantile(delta, 0.05),
              mudelta=mean(delta),
              p95delta=quantile(delta, 0.95),
              .groups="drop"
    )
  # Plot by cluster and metric (cumulative view)
  p1 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mucum), lwd=1) +
    geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
    facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
               scales=facetScale
    ) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
  print(p1)
  # Plot incremental rather than cumulative
  p2 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mudelta), lwd=1) +
    geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
    facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
  print(p2)
  # Return the clustering frame
  clData
}
# Summarize/plot using the pre-built consolidated clusters (obj passed as a data frame)
clData_100k_k8 <- plotHierarchical(obj=select(dfClust100k_k8, countyFIPS, cluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
Next, the existing algorithm is updated to assign every cluster to the closest cluster mean:
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum", typeUse="pct") {
  # Squared euclidean distance from each county's curve to each cluster mean curve.
  # FUNCTION ARGUMENTS:
  # dfClust: cluster-date summary frame (output of plotHierarchical)
  # dfAll: county-date frame containing the typeUse column
  # metType: metric (value of the "name" column) to compare on
  # metClust: summary column of dfClust holding the cluster mean curve
  # typeUse: column of dfAll holding the county curve (e.g. "pct" or "value")
  clustMeans <- dfClust %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("cluster", "date", metClust)) %>%
    colRenamer(vecRename=purrr::set_names("keyMetric", metClust))
  countyCurves <- dfAll %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("countyFIPS", "date", typeUse)) %>%
    colRenamer(vecRename=purrr::set_names("keyMetric", typeUse))
  # Cross every cluster with every county on date; missing values count as 0
  clustMeans %>%
    left_join(countyCurves, by="date") %>%
    mutate(clustValue=ifelse(is.na(keyMetric.x), 0, keyMetric.x),
           countyValue=ifelse(is.na(keyMetric.y), 0, keyMetric.y),
           delta2=(clustValue - countyValue)^2
    ) %>%
    group_by(cluster, countyFIPS) %>%
    summarize(dist=sum(delta2), .groups="drop") %>%
    arrange(countyFIPS, dist)
}
# Calculate distances to cluster mean (raw value scale this time), then print
countyClusterDist_new <- tempDistCluster(dfClust=clData_100k_k8, dfAll=p2DataAll, typeUse="value")
countyClusterDist_new
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 3 01001 9569283.
## 2 2 01001 67090707.
## 3 1 01001 88240896.
## 4 5 01001 353564016.
## 5 9 01001 983953954.
## 6 1 01003 14296132.
## 7 3 01003 59333163.
## 8 2 01003 191161474.
## 9 5 01003 563092256.
## 10 9 01003 1367670103.
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_new <- dfClust100k_k8 %>%
select(countyFIPS, hierCluster=cluster) %>%
left_join(countyClusterDist_new, by="countyFIPS") %>%
arrange(countyFIPS, dist)
# Keep the closest cluster mean per county, then cross-tab against assignment
tmpDistDF_new %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
count(hierCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
# NOTE(review): axis labels still say "(k=5)" but these clusters come from the
# k=8 cut consolidated to 5 groups -- confirm whether labels should be updated
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
While there are some minor differences, overwhelmingly counties are already aligned to the closest cluster mean. The approach is then extended to counties with populations under 100k:
# Full clustering database: every county assigned to its closest cluster mean;
# hierCluster is NA for counties that were not in the 100k+ hierarchical data
distOnlyClust_new <- countyClusterDist_new %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_k8, countyFIPS, hierCluster=cluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_new %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels ordered so lighter viridis colors mean higher/earlier burden)
sparseCountyClusterMap(rename(distOnlyClust_new, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 3, 2, 9, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of tdpm7)"
) +
labs(title="Lighter clusters generally have higher/earlier burden")
There continue to be meaningful geographic trends, with counties frequently adjacent to other counties in the same segment. Smaller counties are over-represented in groups with relatively high burden (9 and 2).
Next steps are to try to bring the shape dimension slightly more to the surface (in particular for segments 2 and 3 which have a fat 90% range early in the pandemic).
Data for segments 2 and 3 are extracted, then clustering is run solely for these groups:
# Create data containing only clusters 2 and 3
p2Data_Seg_23 <- distOnlyClust_new %>%
filter(hierCluster %in% c(2, 3)) %>%
select(countyFIPS, hierCluster) %>%
left_join(p2DataAll, by="countyFIPS")
p2Data_Seg_23
## # A tibble: 234,112 x 8
## countyFIPS hierCluster pop state date name value pct
## <chr> <fct> <dbl> <chr> <date> <chr> <dbl> <dbl>
## 1 01015 2 113605 AL 2020-01-25 tdpm7 0 0
## 2 01015 2 113605 AL 2020-01-25 tcpm7 0 0
## 3 01015 2 113605 AL 2020-01-26 tdpm7 0 0
## 4 01015 2 113605 AL 2020-01-26 tcpm7 0 0
## 5 01015 2 113605 AL 2020-01-27 tdpm7 0 0
## 6 01015 2 113605 AL 2020-01-27 tcpm7 0 0
## 7 01015 2 113605 AL 2020-01-28 tdpm7 0 0
## 8 01015 2 113605 AL 2020-01-28 tcpm7 0 0
## 9 01015 2 113605 AL 2020-01-29 tdpm7 0 0
## 10 01015 2 113605 AL 2020-01-29 tcpm7 0 0
## # ... with 234,102 more rows
# Create distance matrix and hierarchical clusters (deaths metric, raw values)
p2Dist_Seg_23 <- p2Data_Seg_23 %>%
filter(name=="tdpm7") %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2Complete_Seg_23 <- hclust(p2Dist_Seg_23, method="complete")
plot(p2Complete_Seg_23)
# Plot cluster summaries for k = 2..5 within segments 2/3
plotList_Seg_23 <- lapply(2:5,
FUN=function(x) plotHierarchical(p2Complete_Seg_23,
k=x,
df=filter(p2Data_Seg_23, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 81 155
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 81 144 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 48 144 33 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 46 2 144 33 11
## Joining, by = "countyFIPS"
# Create the four cluster summary
dfClust_Seg_23_k4 <- cutree(p2Complete_Seg_23, k=4) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
mutate(cluster=factor(tempCluster))
# Comparison to original hierarchical cluster (each original segment splits in two)
dfClust_Seg_23_k4 %>%
left_join(select(distOnlyClust_new, countyFIPS, hierCluster), by="countyFIPS") %>%
count(hierCluster, cluster)
## # A tibble: 4 x 3
## hierCluster cluster n
## <fct> <fct> <int>
## 1 2 1 48
## 2 2 3 33
## 3 3 2 144
## 4 3 4 11
# Map of the new clusters
dfClust_Seg_23_k4 %>%
select(fips=countyFIPS, cluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
# Plot the four cluster summary (obj passed as a data frame)
clData_Seg_23_k4 <- plotHierarchical(obj=select(dfClust_Seg_23_k4, countyFIPS, cluster),
df=p2Data_Seg_23,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
As expected, each of the original segments is split, helping to bring out a mix of shape and total burden:
While small, there is meaningful value in the ~2000 deaths per million cluster with early disease. It is focused on major metros (Chicago, Detroit, Philadelphia, New Orleans, NYC collar) that had early spikes similar to core NYC counties but without the associated high levels of total burden. These are distinct from the much larger group of counties that got to ~2000 deaths per million with a much later timing.
Next steps are to create a full clustering database and extend to counties with under 100k population.
First, the segments for counties with 100k+ are combined:
# Combine the original 100k+ segments with the segment-2/3 sub-clusters:
# finalCluster is origCluster unchanged when not re-clustered, otherwise
# 10*origCluster + newCluster (e.g. 2/1 -> 21)
dfClust100k_full <- dfClust100k_k8 %>%
select(countyFIPS, origCluster=cluster) %>%
full_join(select(dfClust_Seg_23_k4, countyFIPS, newCluster=cluster), by="countyFIPS") %>%
mutate(finalCluster=ifelse(is.na(newCluster),
as.integer(as.character(origCluster)),
10*as.integer(as.character(origCluster)) + as.integer(as.character(newCluster))
),
finalCluster=factor(finalCluster, levels=sort(unique(finalCluster)))
)
dfClust100k_full
## # A tibble: 599 x 4
## countyFIPS origCluster newCluster finalCluster
## <chr> <fct> <fct> <fct>
## 1 01003 1 <NA> 1
## 2 01015 2 1 21
## 3 01055 2 1 21
## 4 01069 2 1 21
## 5 01073 2 1 21
## 6 01081 1 <NA> 1
## 7 01089 1 <NA> 1
## 8 01097 3 2 32
## 9 01101 2 3 23
## 10 01103 2 1 21
## # ... with 589 more rows
# Counts by original, sub-, and final cluster
dfClust100k_full %>%
count(origCluster, newCluster, finalCluster)
## # A tibble: 7 x 4
## origCluster newCluster finalCluster n
## <fct> <fct> <fct> <int>
## 1 1 <NA> 1 319
## 2 2 1 21 48
## 3 2 3 23 33
## 4 3 2 32 144
## 5 3 4 34 11
## 6 9 <NA> 9 21
## 7 5 <NA> 5 23
# Plot the seven cluster summary
clData_Seg_100k_full <- plotHierarchical(obj=select(dfClust100k_full, countyFIPS, cluster=finalCluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
# Map of the new clusters
dfClust100k_full %>%
select(fips=countyFIPS, cluster=finalCluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
Smaller counties are then mapped to the nearest segment average:
# Calculate distances to cluster mean for all counties (p2All), then print
countyClusterDist_100k_full <- tempDistCluster(dfClust=clData_Seg_100k_full, dfAll=p2All, typeUse="value")
countyClusterDist_100k_full
## # A tibble: 21,994 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 32 01001 6770143.
## 2 21 01001 46397990.
## 3 1 01001 88240896.
## 4 34 01001 123376092.
## 5 23 01001 125257271.
## 6 5 01001 353564016.
## 7 9 01001 983953954.
## 8 1 01003 14296132.
## 9 32 01003 51092603.
## 10 21 01003 143224481.
## # ... with 21,984 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_100k_full <- dfClust100k_full %>%
select(countyFIPS, finalCluster) %>%
left_join(countyClusterDist_100k_full, by="countyFIPS") %>%
arrange(countyFIPS, dist)
# Keep the closest cluster mean per county, then cross-tab against assignment
tmpDistDF_100k_full %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
count(hierCluster=finalCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
# NOTE(review): "(k=5)" in the axis labels looks stale for these combined
# segments -- confirm intended labeling
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
# Full clustering database: each county assigned to its closest combined-segment mean
distOnlyClust_100k_full <- countyClusterDist_100k_full %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_full, countyFIPS, hierCluster=finalCluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_100k_full %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels ordered so lighter viridis colors mean higher and/or earlier burden)
sparseCountyClusterMap(rename(distOnlyClust_100k_full, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 32, 34, 21, 23, 5, 9))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher burden and/or higher percentage of burden early")
There continues to be meaningful patterns by geography. Next steps are to download the latest data and run the full segment diagnostics.
New data are downloaded, with the above segments applied:
# Input files for the 2021-06-22 refresh; reference data for comparison checks
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210622.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210622.csv"
)
compareList <- list("usafCase"=cty_newdata_20210608$dfRaw$usafCase,
"usafDeath"=cty_newdata_20210608$dfRaw$usafDeath
)
# Apply the distance-based segments; counties under 25k population are lumped
# into a catch-all cluster "999"
dfClust <- distOnlyClust_100k_full %>%
select(countyFIPS, cluster) %>%
mutate(cluster=as.character(cluster)) %>%
left_join(getCountyData(), by="countyFIPS") %>%
mutate(origCluster=cluster, cluster=ifelse(pop>=25000, cluster, "999"))
# Named vector of cluster by countyFIPS, as expected by readRunUSAFacts
useClusters <- as.character(dfClust$cluster) %>%
purrr::set_names(dfClust$countyFIPS)
# Create new clusters (downloadTo=NA per file skips download when it already exists)
cty_newdata_20210622 <- readRunUSAFacts(maxDate="2021-06-20",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log",
ovrwriteLog=TRUE,
useClusters=useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.66e+9 3.29e+7 1647588
## 2 after 6.63e+9 3.28e+7 1621272
## 3 pctchg 4.52e-3 3.78e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.34e+8 595239 1647588
## 2 after 1.34e+8 593224 1621272
## 3 pctchg 3.70e-3 0.00339 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210622$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Persist the refreshed object; ovrWriteError=FALSE allows overwriting an existing RDS
saveToRDS(cty_newdata_20210622, ovrWriteError=FALSE)
The updated county clustering routines appear to work as intended. Next steps are to update the CDC excess deaths process.
The latest data are downloaded, with existing clusters applied:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210622")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210622")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210813 <- readRunUSAFacts(maxDate="2021-08-11",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210622")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 8.38e+9 3.54e+7 1810431
## 2 after 8.34e+9 3.52e+7 1781514
## 3 pctchg 4.49e-3 5.53e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.65e+8 611955 1810431
## 2 after 1.64e+8 607482 1781514
## 3 pctchg 3.93e-3 0.00731 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210813$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Dates with large amounts of negative restatement
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
arrange(-nNeg)
## # A tibble: 567 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 557 more rows
# Restatements taking place between July 4-14, 2021
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2021-07-04", date <= "2021-07-14")
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 89 1588 21977
## 11 2021-07-14 97 1984 32297
# Restatements taking place between December 21, 2020 and January 6, 2021
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
saveToRDS(cty_newdata_20210813, ovrWriteError=FALSE)
There is a very large restatement of new_cases data occurring on or around July 8-11, 2021. Perhaps smoothing can be considered where each value for July 8-11, 2021 data is treated as the average across those four days. There is a similar though smaller anomaly in the December 24-29, 2020 range.
Dates are examined for those with the largest number of negative revisions in new_cases and/or new_deaths:
# All restatements
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
group_by(date, name) %>%
summarize(nNeg=sum(value <= -1), sumNeg=sum(ifelse(value < 0, value, 0)), .groups="drop")
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting less than or equal to -1 for metric on day"
)
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value for counties recording negative",
title="Counties reporting less than 0 for metric on day"
)
For records with negative values, the rolling-7 is calculated around the day of the negative:
# All restatements
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
arrange(date) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
group_by(date, name) %>%
summarize(nNeg=sum(isNeg & isNeg7), sumNeg=sum(ifelse(isNeg & isNeg7, value, 0)), .groups="drop")
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting negative value that are also negative over the 7-day window"
)
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value",
title="Counties reporting less than 0 for metric on day that are also negative over the 7-day window"
)
cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
arrange(date) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
ungroup() %>%
group_by(name, isNeg, isNeg7) %>%
summarize(n=n(), sumValue=sum(value))
## `summarise()` has grouped output by 'name', 'isNeg'. You can override using the `.groups` argument.
## # A tibble: 16 x 5
## # Groups: name, isNeg [8]
## name isNeg isNeg7 n sumValue
## <chr> <lgl> <lgl> <int> <dbl>
## 1 cpm7 FALSE FALSE 1701869 336714571.
## 2 cpm7 FALSE TRUE 45946 3248568.
## 3 cpm7 TRUE FALSE 3859 -265077.
## 4 cpm7 TRUE TRUE 10988 -2790977.
## 5 dpm7 FALSE FALSE 1627092 6667360.
## 6 dpm7 FALSE TRUE 126669 20031.
## 7 dpm7 TRUE FALSE 678 -4144.
## 8 dpm7 TRUE TRUE 8223 -102243.
## 9 new_cases FALSE FALSE 1732888 35567937
## 10 new_cases FALSE TRUE 30335 362101
## 11 new_cases TRUE FALSE 14909 -600467
## 12 new_cases TRUE TRUE 3382 -114828
## 13 new_deaths FALSE FALSE 1749115 616693
## 14 new_deaths FALSE TRUE 27664 2807
## 15 new_deaths TRUE FALSE 3087 -7199
## 16 new_deaths TRUE TRUE 1648 -4819
Making an adjustment for situations where a metric is negative but the 7-day total is non-negative can help to smooth some restatement anomalies. There will still be restatements in the data, and these seem to increase in the more recent months of the data.
Next steps are to build a function to manage the smoothing, with a goal to incorporate this as part of per-capita processing.
A function is built to take an ordered vector and to replace negative values, and all of the surrounding n values, with the mean of those n values:
smoothNegativeValues <- function(x,
negTol=-0.001,
windowSize=3,
windowAlign="center",
...
) {
# Replace negative values in an ordered vector (and the values immediately
# around them) with the local rolling mean, provided that rolling mean is
# itself non-negative.
# FUNCTION ARGUMENTS:
# x: a vector sorted so that successive observations are sequential
# negTol: values below this threshold are treated as negative
# windowSize: width of the rolling-mean window
# windowAlign: alignment for the window (passed to zoo::rollmean)
# ...: any further arguments forwarded to zoo::rollmean
# Rolling means over x; edges fill with NA so they are never used as fixes
rollMu <- zoo::rollmean(x, k=windowSize, fill=NA, align=windowAlign, ...)
# Positions eligible for replacement: negative observations whose window
# mean is defined and is not itself below the negative tolerance
replaceAt <- intersect(which(x < negTol),
which(!is.na(rollMu) & rollMu >= negTol)
)
# Overwrite each eligible position and its neighbors with the window mean
# CAUTION: closely-spaced negatives may be over-written more than once
# CAUTION: the symmetric-window arithmetic below assumes center alignment
halfWidth <- ceiling((windowSize-1)/2)
for (at in replaceAt) {
x[(at-halfWidth):(at+halfWidth)] <- rollMu[at]
}
# Return the updated vector
x
}
# Simple example, works well
smoothNegativeValues(c(1:5, 10, -2, 10, 5:1))
## [1] 1 2 3 4 5 6 6 6 5 4 3 2 1
# More complex example, double overwriting leads to incorrect vector sum
smoothNegativeValues(c(1:5, 14, -2, -2, 14, 5:1)) %>% round(1)
## [1] 1.0 2.0 3.0 4.0 5.0 3.3 3.3 3.3 3.3 5.0 4.0 3.0 2.0 1.0
# All counties with negatives
allNegCases <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, new_cases) %>%
group_by(countyFIPS) %>%
filter(new_cases <= -0.5) %>%
ungroup() %>%
arrange(new_cases)
allNegCases
## # A tibble: 18,291 x 4
## countyFIPS state date new_cases
## <chr> <chr> <date> <dbl>
## 1 48113 TX 2021-07-09 -43526
## 2 48439 TX 2021-07-09 -43415
## 3 48029 TX 2021-07-09 -41518
## 4 48215 TX 2021-07-09 -30342
## 5 55053 WI 2020-12-27 -21000
## 6 48121 TX 2021-07-09 -20662
## 7 48029 TX 2020-12-25 -18856
## 8 48085 TX 2021-07-09 -17026
## 9 06037 CA 2020-12-25 -12459
## 10 48355 TX 2021-07-09 -12254
## # ... with 18,281 more rows
allNegCases %>%
summarize(n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
## n new_cases
## <int> <dbl>
## 1 18291 -715295
# Function to check data in a key county
countyCheck <- function(df, keyDate, keyFIPS, windowSize=7) {
# Inspect the effect of smoothNegativeValues() for a single county around
# a key date.
# FUNCTION ARGUMENTS:
# df: per-capita county data containing countyFIPS, state, date, cases, new_cases
# keyDate: Date (or string coercible to Date) at the center of the display window
# keyFIPS: countyFIPS code identifying the county to examine
# windowSize: window size passed through to smoothNegativeValues()
# SIDE EFFECTS: prints the county's negative new_cases records, the raw vs.
# smoothed values in the window around keyDate, a comparison plot, and the
# raw vs. smoothed column totals; returns the totals tibble invisibly
# (via the final print(), which returns its argument invisibly)
# Use inherits() rather than comparing against class(), which can be length > 1
if (!inherits(keyDate, "Date")) keyDate <- as.Date(keyDate)
ctyData <- df %>%
filter(countyFIPS==keyFIPS)
# Show all records where rounded new_cases is negative
ctyData %>%
filter(round(new_cases) < 0) %>%
print()
ctyData <- ctyData %>%
mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=windowSize))
# Show raw vs. smoothed values in the window centered on keyDate
# (hoist the half-window so the expression is computed once)
halfWindow <- ceiling((windowSize-1)/2)
ctyData %>%
filter(date >= keyDate - halfWindow, date <= keyDate + halfWindow) %>%
select(countyFIPS, state, date, cases, new_cases, smoothCases) %>%
print()
p1 <- ctyData %>%
select(countyFIPS, state, date, new_cases, smoothCases) %>%
pivot_longer(-c(countyFIPS, state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name)) +
facet_wrap(~c("new_cases"="Raw cases", "smoothCases"="Smoothed cases")[name]) +
labs(x=NULL, y="Cases", title=paste0("Cases data for county ", keyFIPS)) +
theme(legend.position="none")
print(p1)
# Totals should match when smoothing only redistributes (isolated negatives);
# pass na.rm inside an anonymous function -- supplying extra arguments to
# across() via ... is deprecated in dplyr >= 1.1
ctyData %>%
select(countyFIPS, state, date, new_cases, smoothCases) %>%
summarize(across(where(is.numeric), function(x) sum(x, na.rm=TRUE))) %>%
print()
}
# Data for countyFIPS 48113 (Texas, negative values on July 9)
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48113")
## # A tibble: 1 x 15
## countyFIPS state date cases new_cases deaths new_deaths tcpm cpm
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48113 TX 2021-07-09 263826 -43526 4139 4 100104. -16515.
## # ... with 6 more variables: tdpm <dbl>, dpm <dbl>, tcpm7 <dbl>, cpm7 <dbl>,
## # tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
## countyFIPS state date cases new_cases smoothCases
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 48113 TX 2021-07-06 306870 0 147.
## 2 48113 TX 2021-07-07 307033 163 147.
## 3 48113 TX 2021-07-08 307352 319 147.
## 4 48113 TX 2021-07-09 263826 -43526 147.
## 5 48113 TX 2021-07-10 307901 44075 147.
## 6 48113 TX 2021-07-11 307901 0 147.
## 7 48113 TX 2021-07-12 307901 0 147.
## # A tibble: 1 x 2
## new_cases smoothCases
## <dbl> <dbl>
## 1 323669 323669
# Examples with repeating negative values
repNegCases <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, new_cases) %>%
group_by(countyFIPS) %>%
filter(new_cases <= -0.5, (lag(new_cases) <= -0.5 | lead(new_cases) <= -0.5)) %>%
ungroup() %>%
arrange(new_cases)
repNegCases
## # A tibble: 2,278 x 4
## countyFIPS state date new_cases
## <chr> <chr> <date> <dbl>
## 1 48471 TX 2021-07-09 -870
## 2 48325 TX 2021-06-12 -406
## 3 21235 KY 2021-06-15 -390
## 4 48025 TX 2021-07-09 -390
## 5 48185 TX 2021-07-09 -390
## 6 21235 KY 2021-06-16 -380
## 7 48201 TX 2021-06-17 -345
## 8 06025 CA 2021-06-30 -336
## 9 48069 TX 2021-07-09 -314
## 10 21235 KY 2021-06-12 -309
## # ... with 2,268 more rows
repNegCases %>%
summarize(n=n(), new_cases=sum(new_cases))
## # A tibble: 1 x 2
## n new_cases
## <int> <dbl>
## 1 2278 -16455
# Data for countyFIPS 48471 (Texas, repeating negative values on/around July 9)
countyCheck(cty_newdata_20210813$dfPerCapita, keyDate="2021-07-09", keyFIPS="48471")
## # A tibble: 52 x 15
## countyFIPS state date cases new_cases deaths new_deaths tcpm cpm
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48471 TX 2020-08-05 2838 -4 40 2 38892. -54.8
## 2 48471 TX 2020-08-16 3189 -1 43 0 43702. -13.7
## 3 48471 TX 2020-09-03 3417 -246 48 0 46827. -3371.
## 4 48471 TX 2020-09-04 3415 -2 48 0 46799. -27.4
## 5 48471 TX 2020-09-09 3403 -451 51 1 46635. -6181.
## 6 48471 TX 2020-09-12 3370 -46 52 0 46183. -630.
## 7 48471 TX 2020-09-17 3567 -119 53 0 48882. -1631.
## 8 48471 TX 2020-09-23 3690 -3 56 0 50568. -41.1
## 9 48471 TX 2020-09-24 3689 -1 57 1 50554. -13.7
## 10 48471 TX 2020-10-07 3788 -31 57 0 51911. -425.
## # ... with 42 more rows, and 6 more variables: tdpm <dbl>, dpm <dbl>,
## # tcpm7 <dbl>, cpm7 <dbl>, tdpm7 <dbl>, dpm7 <dbl>
## # A tibble: 7 x 6
## countyFIPS state date cases new_cases smoothCases
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 48471 TX 2021-07-06 8703 2 3.29
## 2 48471 TX 2021-07-07 8757 54 3.29
## 3 48471 TX 2021-07-08 8718 -39 3.29
## 4 48471 TX 2021-07-09 7848 -870 3.29
## 5 48471 TX 2021-07-10 8718 870 3.29
## 6 48471 TX 2021-07-11 8721 3 1.29
## 7 48471 TX 2021-07-12 8724 3 1.29
## # A tibble: 1 x 2
## new_cases smoothCases
## <dbl> <dbl>
## 1 9240 9393.
Curing the single negative value issue addresses over 95% of the observed issue with new_cases.
Despite the limitations, this appears promising for further exploration. The function is applied to the full per-capita dataset:
# Smoothed cases for all counties
perCapTemp <- cty_newdata_20210813$dfPerCapita %>%
select(countyFIPS, state, date, cases, new_cases) %>%
group_by(countyFIPS, state) %>%
mutate(smoothCases=smoothNegativeValues(new_cases, windowSize=7),
cumNew=cumsum(new_cases),
cumSmooth=cumsum(smoothCases),
smooth7=zoo::rollmean(smoothCases, k=7, fill=NA),
cases7=zoo::rollmean(new_cases, k=7, fill=NA)
) %>%
ungroup()
perCapTemp
## # A tibble: 1,781,514 x 10
## countyFIPS state date cases new_cases smoothCases cumNew cumSmooth
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2020-01-22 0 0 0 0 0
## 2 01003 AL 2020-01-22 0 0 0 0 0
## 3 01005 AL 2020-01-22 0 0 0 0 0
## 4 01007 AL 2020-01-22 0 0 0 0 0
## 5 01009 AL 2020-01-22 0 0 0 0 0
## 6 01011 AL 2020-01-22 0 0 0 0 0
## 7 01013 AL 2020-01-22 0 0 0 0 0
## 8 01015 AL 2020-01-22 0 0 0 0 0
## 9 01017 AL 2020-01-22 0 0 0 0 0
## 10 01019 AL 2020-01-22 0 0 0 0 0
## # ... with 1,781,504 more rows, and 2 more variables: smooth7 <dbl>,
## # cases7 <dbl>
perCapTemp %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE))
## # A tibble: 1 x 7
## cases new_cases smoothCases cumNew cumSmooth smooth7 cases7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8343470745 35214743 35220770 8343470745 8344928995. 34880628. 34874611.
plotTemp <- perCapTemp %>%
group_by(date) %>%
summarize(across(where(is.numeric), specNA(sum))) %>%
pivot_longer(-date)
# Plot #1: Cumulative cases by technique
plotTemp %>%
filter(name %in% c("cases", "cumNew", "cumSmooth")) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=name,
color=c("cases"="Cases", "cumNew"="Sum new_cases", "cumSmooth"="Sum smoothed cases")[name]
)
) +
labs(x=NULL, y="Cumulative Cases (millions)", title="Cumulative case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
# Plot #2: new_cases vs smoothed cases
plotTemp %>%
filter(name %in% c("new_cases", "smoothCases")) %>%
ggplot(aes(x=date, y=value/1000)) +
geom_line(aes(group=name,
color=c("new_cases"="Raw new_cases", "smoothCases"="Smoothed cases")[name]
)
) +
labs(x=NULL, y="New Cases (000s)", title="New case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
# Plot #3: Rolling-7 mean for new_cases vs smoothed cases
plotTemp %>%
filter(name %in% c("cases7", "smooth7")) %>%
ggplot(aes(x=date, y=value/1000)) +
geom_line(aes(group=name,
color=c("cases7"="new_cases", "smooth7"="Smoothed cases")[name]
)
) +
labs(x=NULL, y="New Cases (000s) - rolling 7-day mean", title="New case evolution by approach") +
scale_color_discrete("Technique") +
theme(legend.position="bottom")
## Warning: Removed 12 row(s) containing missing values (geom_path).
At a national level, the smoothing significantly helps with the July negative/positive restatement, as well as providing some assistance in other time periods. Plots are made for each state using the smoothed data:
perCapTemp %>%
group_by(state, date) %>%
summarize(smooth7=specNA(sum)(smooth7), .groups="drop") %>%
ggplot(aes(x=date, y=smooth7/1000)) +
geom_line() +
labs(x=NULL, y="Cases (000s)", title="Smoothed rolling-7 mean cases per day by state") +
facet_wrap(~state, scales="free_y")
## Warning: Removed 6 row(s) containing missing values (geom_path).
Several artifacts are observed, including:
These can be further explored as a next step, with particular focus on Florida (recency) and the late 2020 pattern observed across multiple states. States are assessed for the occurrence of zero case days after July 1, 2020:
perCapGroup <- perCapTemp %>%
mutate(epi=paste0(lubridate::epiyear(date), "-", zeroPad2(lubridate::epiweek(date)))) %>%
group_by(state, date, epi) %>%
summarize(new_cases=sum(new_cases, na.rm=TRUE), n=n(), .groups="drop")
perCapGroup
## # A tibble: 28,917 x 5
## state date epi new_cases n
## <chr> <date> <chr> <dbl> <int>
## 1 AK 2020-01-22 2020-04 0 29
## 2 AK 2020-01-23 2020-04 0 29
## 3 AK 2020-01-24 2020-04 0 29
## 4 AK 2020-01-25 2020-04 0 29
## 5 AK 2020-01-26 2020-05 0 29
## 6 AK 2020-01-27 2020-05 0 29
## 7 AK 2020-01-28 2020-05 0 29
## 8 AK 2020-01-29 2020-05 0 29
## 9 AK 2020-01-30 2020-05 0 29
## 10 AK 2020-01-31 2020-05 0 29
## # ... with 28,907 more rows
perCapGroup %>%
filter(date >= "2020-07-01", round(new_cases) <= 0) %>%
group_by(state) %>%
mutate(nDays=n()) %>%
ungroup() %>%
ggplot(aes(x=date, y=fct_reorder(state, nDays))) +
geom_tile(aes(fill=ifelse(round(new_cases)==0, "blue", "red"))) +
scale_fill_identity() +
labs(x=NULL,
y=NULL,
title="States not reporting positive cases by day",
subtitle="Blue is zero, red is negative"
)
perCapGroup %>%
filter(epi >= "2020-26", epi <= "2021-31") %>%
group_by(state, epi) %>%
summarize(nPos=sum(round(new_cases)>0), .groups="drop") %>%
group_by(state) %>%
mutate(nTot=sum(nPos)) %>%
ungroup() %>%
ggplot(aes(x=epi, y=fct_reorder(state, nTot))) +
geom_tile(aes(fill=factor(nPos, levels=0:7))) +
coord_flip() +
scale_fill_viridis_d("# days") +
labs(x=NULL,
y=NULL,
title="Number of days in epiweek with positive cases reported for state"
)
Reporting appears to be less frequent in many states in 2021. Due to holidays and other artifacts, reporting may not have a strict 7-day seasonality, thus smoothing and rolling-7 calculations may still be spiky. This is particularly prevalent around the late winter holiday (mid-December to early-January), which may drive spikes observed in late 2020.
Nebraska appears to have stopped reporting new_cases data at the county level and Rhode Island appears to have always reported on a once-per-week basis.
Particular attention should be paid to states that show a pattern by epiweek (e.g., always 5 or always 7) that then changes, especially if on a temporary basis. That appears to flag many of the states that have spiky cases, even after smoothing, in late 2020.
States and epiweeks that meet the following criteria are extracted:
Example code includes:
perCapReport <- perCapGroup %>%
group_by(state, epi) %>%
summarize(nPos=sum(round(new_cases)>0), .groups="drop") %>%
arrange(state, epi) %>%
group_by(state) %>%
mutate(dPrev=ifelse(row_number()==1, 0, abs(lag(nPos)-nPos)/pmax(1, lag(nPos)+nPos)),
dNext=ifelse(row_number()==n(), 0, abs(lead(nPos)-nPos)/pmax(1, lead(nPos)+nPos))
) %>%
ungroup()
perCapReport
## # A tibble: 4,182 x 5
## state epi nPos dPrev dNext
## <chr> <chr> <int> <dbl> <dbl>
## 1 AK 2020-04 0 0 0
## 2 AK 2020-05 0 0 0
## 3 AK 2020-06 0 0 0
## 4 AK 2020-07 0 0 0
## 5 AK 2020-08 0 0 0
## 6 AK 2020-09 0 0 0
## 7 AK 2020-10 0 0 1
## 8 AK 2020-11 1 1 0.5
## 9 AK 2020-12 3 0.5 0.4
## 10 AK 2020-13 7 0.4 0
## # ... with 4,172 more rows
perCapAnomaly <- perCapReport %>%
mutate(dMax=pmax(dPrev, dNext)) %>%
filter(epi >= "2020-26", epi <= "2021-30", dMax >= 0.3) %>%
group_by(state) %>%
mutate(n=n()) %>%
ungroup()
perCapAnomaly
## # A tibble: 123 x 7
## state epi nPos dPrev dNext dMax n
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 AK 2020-46 7 0 0.4 0.4 5
## 2 AK 2020-47 3 0.4 0.5 0.5 5
## 3 AK 2020-48 1 0.5 0 0.5 5
## 4 AK 2020-49 1 0 0.75 0.75 5
## 5 AK 2020-50 7 0.75 0 0.75 5
## 6 AZ 2020-51 7 0 0.75 0.75 6
## 7 AZ 2020-52 1 0.75 0.714 0.75 6
## 8 AZ 2020-53 6 0.714 0.0769 0.714 6
## 9 AZ 2021-07 7 0 0.556 0.556 6
## 10 AZ 2021-08 2 0.556 0.429 0.556 6
## # ... with 113 more rows
perCapAnomaly%>%
ggplot(aes(x=fct_reorder(state, -n), y=epi)) +
geom_tile(aes(fill=factor(nPos, levels=0:7))) +
scale_fill_viridis_d("# days") +
labs(x=NULL, y=NULL, title="Potential reporting discontinuities by state and epiweek")
# Examples for KS data
perCapGroup %>%
semi_join(perCapAnomaly, by=c("state", "epi")) %>%
ggplot(aes(x=date, y=new_cases)) +
geom_point() +
labs(x=NULL, y="New cases", title="Potentially anomalous data weeks by state") +
facet_wrap(~state, scales="free")
This algorithm broadly picks out the periods where states show artifact-like spikiness on the rolling-7 case charts. Next steps are to run this process on the smoothed data, as some negative/positive spikes are addressed automatically by that methodology.
A comparison is made between the summed USAF county-level data and the CDC state-level data:
stateSmoothUSAF <- perCapTemp %>%
group_by(state, date) %>%
summarize(across(where(is.numeric), .fns=specNA(sum)), .groups="drop")
stateSmoothUSAF
## # A tibble: 28,917 x 9
## state date cases new_cases smoothCases cumNew cumSmooth smooth7 cases7
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-01-22 0 0 0 0 0 NA NA
## 2 AK 2020-01-23 0 0 0 0 0 NA NA
## 3 AK 2020-01-24 0 0 0 0 0 NA NA
## 4 AK 2020-01-25 0 0 0 0 0 0 0
## 5 AK 2020-01-26 0 0 0 0 0 0 0
## 6 AK 2020-01-27 0 0 0 0 0 0 0
## 7 AK 2020-01-28 0 0 0 0 0 0 0
## 8 AK 2020-01-29 0 0 0 0 0 0 0
## 9 AK 2020-01-30 0 0 0 0 0 0 0
## 10 AK 2020-01-31 0 0 0 0 0 0 0
## # ... with 28,907 more rows
stateCDC <- readFromRDS("cdc_daily_210815")$dfPerCapita %>%
select(state, date, cdcCases=new_cases) %>%
mutate(cdcCases=ifelse(is.na(cdcCases), 0, cdcCases)) %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(cdcCase7=zoo::rollmean(cdcCases, k=7, fill=NA)) %>%
ungroup()
stateCDC
## # A tibble: 29,334 x 4
## state date cdcCases cdcCase7
## <chr> <date> <dbl> <dbl>
## 1 AK 2020-01-22 0 NA
## 2 AK 2020-01-23 0 NA
## 3 AK 2020-01-24 0 NA
## 4 AK 2020-01-25 0 0
## 5 AK 2020-01-26 0 0
## 6 AK 2020-01-27 0 0
## 7 AK 2020-01-28 0 0
## 8 AK 2020-01-29 0 0
## 9 AK 2020-01-30 0 0
## 10 AK 2020-01-31 0 0
## # ... with 29,324 more rows
tempPlotData <- stateSmoothUSAF %>%
select(state, date, usafSmooth=smoothCases, usafSmooth7=smooth7) %>%
full_join(stateCDC, by=c("state", "date")) %>%
arrange(date) %>%
group_by(state) %>%
mutate(cumUSAF=cumsum(ifelse(is.na(usafSmooth), 0, usafSmooth)), cumCDC=cumsum(cdcCases)) %>%
ungroup() %>%
pivot_longer(-c(state, date))
tempPlotState <- function(df, keyStates, keyVars) {
# Plot a comparison of case metrics for selected states.
# FUNCTION ARGUMENTS:
# df: long-format data with columns state, date, name (metric), value
# keyStates: character vector of state codes to include (one facet each)
# keyVars: character vector of metric names to include
# SIDE EFFECTS: prints the plot; returns the ggplot object invisibly
# NOTE: all_of() is a tidyselect helper for selecting columns; using it
# inside a data-masking expression like %in% is deprecated (and errors in
# tidyselect >= 1.2), so plain vector membership is used here instead
p1 <- df %>%
filter(name %in% keyVars, state %in% keyStates, !is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name)) +
labs(x=NULL, y="Cases", title="Comparison of cases in CDC and USAF") +
scale_color_discrete("Metric") +
facet_wrap(~state, scales="free_y")
print(p1)
invisible(p1)
}
# Examples for states that appear well-aligned
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("usafSmooth", "cdcCases"))
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("IL", "PA"), keyVars=c("cumCDC", "cumUSAF"))
# Examples for states with anomalies
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("usafSmooth", "cdcCases"))
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("MO", "FL", "TX", "NJ"), keyVars=c("cumCDC", "cumUSAF"))
# States with largest disconnects as percentage
tempPlotData %>%
filter(name %in% c("cumUSAF", "cumCDC")) %>%
pivot_wider(c(state, date)) %>%
group_by(state) %>%
summarize(absDelta=sum(abs(cumUSAF-cumCDC)), sumValues=sum(cumUSAF+cumCDC)) %>%
mutate(pct=absDelta/sumValues) %>%
ggplot(aes(x=fct_reorder(state, pct), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), hjust=0, size=3) +
labs(x=NULL,
y="Percent different in cumsum for CDC and USAF",
title="Cumulative totals in CDC vs USAF",
subtitle="Calculated as absolute value of differences divided by mean of values"
) +
coord_flip()
# Examples for states with large differences
tempPlotState(tempPlotData, keyStates=c("GA", "MI", "RI", "VT", "MA", "CA"), keyVars=c("usafSmooth7", "cdcCase7"))
tempPlotState(tempPlotData, keyStates=c("GA", "MI", "RI", "VT", "MA", "CA"), keyVars=c("cumCDC", "cumUSAF"))
In some states, there is a general trend that county-level sums (USAF) are lower than state-level totals (CDC). In these states, the shape of the curve appears to be closely matched between the data sources.
For the more spiky states, it appears that cumulative totals are roughly aligned, but with differences along the way to the cumulative total.
The latest data are downloaded, with existing clusters applied:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210904.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210904.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210813")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210813")$dfRaw$usafDeath
)
# Create new clusters
cty_newdata_20210904 <- readRunUSAFacts(maxDate="2021-09-02",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210904.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 6
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 88 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210904.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 6
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 37 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210904_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 8.60e+9 3.62e+7 1829589
## 2 after 8.56e+9 3.60e+7 1800366
## 3 pctchg 4.51e-3 4.05e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.68e+8 615386 1829589
## 2 after 1.68e+8 610247 1800366
## 3 pctchg 4.03e-3 0.00835 0.0160
## NULL
# Plot all counties based on closest cluster
# Maps each county (from the reused 20210813 clustering) to its nearest cluster.
# NOTE(review): "viridis" is not an RColorBrewer palette name — presumably
# sparseCountyClusterMap() handles it specially; confirm in
# Coronavirus_USAF_Functions_v001.R.
sparseCountyClusterMap(cty_newdata_20210904$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Rank dates by how many counties restated counts downward (negative new_cases),
# alongside the count of positive-reporting counties and the net national total.
cty_newdata_20210904$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  arrange(desc(nNeg))
## # A tibble: 573 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 563 more rows
# Daily positive/negative reporting counts over July 4-14, 2021, the window
# surrounding the large 2021-07-09 negative restatement.
cty_newdata_20210904$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  filter(between(date, as.Date("2021-07-04"), as.Date("2021-07-14")))
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 92 1603 23919
## 11 2021-07-14 97 1983 32194
# Daily positive/negative reporting counts from Dec 21, 2020 through Jan 6, 2021,
# covering the holiday-period restatements (notably Dec 25 and Dec 28).
cty_newdata_20210904$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  filter(between(date, as.Date("2020-12-21"), as.Date("2021-01-06")))
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
# Persist the processed 20210904 object. The output below shows an existing file
# is kept (not overwritten) and, with ovrWriteError=FALSE, that is reported as a
# message rather than an error.
saveToRDS(cty_newdata_20210904, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_newdata_20210904.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
The update appears to be missing data after August 16, 2021. Further exploration is needed. It appears that the data have moved to a different link. The urlMapper is updated and the process re-run:
# Point the download URLs at the new static.usafacts.org locations (the prior
# link stopped updating after 2021-08-16).
urlMapper[["usafCase"]] <- "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
urlMapper[["usafDeath"]] <- "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"
urlMapper[["usafPop"]] <- "https://static.usafacts.org/public/data/covid-19/covid_county_population_usafacts.csv"
# Local file paths for the freshly downloaded cases/deaths CSVs.
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210905.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210905.csv"
)
# Reference data (raw 20210813 download) to diff the new files against.
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210813")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210813")$dfRaw$usafDeath
)
# Re-run the full read/process/analyze pipeline, reusing the 20210813 clusters
# (useClusters) rather than re-fitting. downloadTo=NA for any file that already
# exists locally, so no re-download occurs in that case.
cty_newdata_20210905 <- readRunUSAFacts(maxDate="2021-09-03",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210905.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 24
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 17 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 371 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210905.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 24
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 16 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 216 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210905_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 9.28e+9 3.90e+7 1887063
## 2 after 9.23e+9 3.89e+7 1856922
## 3 pctchg 4.70e-3 4.54e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.80e+8 641157 1887063
## 2 after 1.79e+8 630136 1856922
## 3 pctchg 4.80e-3 0.0172 0.0160
## NULL
# Plot all counties based on closest cluster
# Same cluster map as the 20210904 run, now using the refreshed 20210905 data.
# NOTE(review): "viridis" is not an RColorBrewer palette name — presumably
# sparseCountyClusterMap() handles it specially; confirm in
# Coronavirus_USAF_Functions_v001.R.
sparseCountyClusterMap(cty_newdata_20210905$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Rank dates by how many counties restated counts downward (negative new_cases),
# alongside the count of positive-reporting counties and the net national total.
cty_newdata_20210905$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  arrange(desc(nNeg))
## # A tibble: 591 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 581 more rows
# Daily positive/negative reporting counts over July 4-14, 2021, the window
# surrounding the large 2021-07-09 negative restatement.
cty_newdata_20210905$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  filter(between(date, as.Date("2021-07-04"), as.Date("2021-07-14")))
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 92 1603 23919
## 11 2021-07-14 97 1983 32194
# Daily positive/negative reporting counts from Dec 21, 2020 through Jan 6, 2021,
# covering the holiday-period restatements (notably Dec 25 and Dec 28).
cty_newdata_20210905$dfPerCapita %>%
  group_by(date) %>%
  summarize(
    nNeg = sum(new_cases < 0),
    nPos = sum(new_cases > 0),
    new_cases = sum(new_cases)
  ) %>%
  filter(between(date, as.Date("2020-12-21"), as.Date("2021-01-06")))
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
# Persist the processed 20210905 object; ovrWriteError=FALSE reports (rather
# than errors on) an already-existing file.
saveToRDS(cty_newdata_20210905, ovrWriteError=FALSE)
Data are updated as expected. As in previous downloads, there are some large restatements that take place in a few key weeks and states.
Plots are created for total deaths per capita by county:
# Create data for total burden and change in burden over past 28 days
# One row per county: total deaths/cases per million as of the latest date
# (tdpm/tcpm), and deaths/cases per million accumulated over the trailing
# 28 days (dpm28/cpm28).
# NOTE(review): the sum(ifelse(date==max(date), tdpm, 0)) idiom picks the
# latest-date value of a cumulative column — it assumes at most one row per
# county-date; confirm dfPerCapita is unique by countyFIPS/state/date.
burdenSummary <- cty_newdata_20210905$dfPerCapita %>%
group_by(countyFIPS, state) %>%
summarize(asofDate=max(date),
tdpm=sum(ifelse(date==max(date), tdpm, 0)),
tcpm=sum(ifelse(date==max(date), tcpm, 0)),
dpm28=sum(ifelse(date>max(date)-28, dpm, 0)),
cpm28=sum(ifelse(date>max(date)-28, cpm, 0)),
.groups="drop"
)
burdenSummary
## # A tibble: 3,142 x 7
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 01001 AL 2021-09-03 2130. 159462. 89.5 21747.
## 2 01003 AL 2021-09-03 1756. 151361. 278. 31209.
## 3 01005 AL 2021-09-03 2633. 127279. 81.0 22563.
## 4 01007 AL 2021-09-03 3304. 159864. 357. 27105.
## 5 01009 AL 2021-09-03 2525. 151541. 104. 21115.
## 6 01011 AL 2021-09-03 4158. 138501. 0 13068.
## 7 01013 AL 2021-09-03 3908. 147367. 206. 20362.
## 8 01015 AL 2021-09-03 3125. 161172. 176. 23221.
## 9 01017 AL 2021-09-03 3819. 145546. 60.1 23516.
## 10 01019 AL 2021-09-03 1871. 96541. 38.2 18705.
## # ... with 3,132 more rows
# Create histograms for burden metrics
# Reshape the four metric columns to long form (default name/value columns)
# and facet one histogram per metric; free x-scales since the metrics differ
# in magnitude by orders of magnitude.
burdenSummary %>%
pivot_longer(-c(countyFIPS, state, asofDate)) %>%
ggplot(aes(x=value)) +
geom_histogram() +
facet_wrap(~name, scales="free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# List county-metric pairs with negative values — per-capita burden cannot be
# truly negative, so these reflect data restatements.
burdenSummary %>%
  pivot_longer(cols = -c(countyFIPS, state, asofDate)) %>%
  filter(value < 0)
## # A tibble: 16 x 5
## countyFIPS state asofDate name value
## <chr> <chr> <date> <chr> <dbl>
## 1 02013 AK 2021-09-03 dpm28 -899.
## 2 02013 AK 2021-09-03 cpm28 -83908.
## 3 02016 AK 2021-09-03 cpm28 -93184.
## 4 02240 AK 2021-09-03 cpm28 -1596.
## 5 02261 AK 2021-09-03 cpm28 -28472.
## 6 08005 CO 2021-09-03 dpm28 -9.14
## 7 08055 CO 2021-09-03 dpm28 -145.
## 8 13257 GA 2021-09-03 dpm28 -38.6
## 9 17011 IL 2021-09-03 dpm28 -30.6
## 10 20081 KS 2021-09-03 dpm28 -252.
## 11 20203 KS 2021-09-03 dpm28 -472.
## 12 26033 MI 2021-09-03 dpm28 -26.8
## 13 37065 NC 2021-09-03 dpm28 -19.4
## 14 37127 NC 2021-09-03 dpm28 -10.6
## 15 41045 OR 2021-09-03 dpm28 -32.7
## 16 44005 RI 2021-09-03 dpm28 -122.
# Create histograms for burden metrics, with all values made no less than 0
# (pmax floors the restatement-driven negatives at zero before binning).
# Human-readable facet labels, keyed by metric column name.
metricLabels <- c("cpm28"="Cases per million (most recent 28 days)",
                  "dpm28"="Deaths per million (most recent 28 days)",
                  "tcpm"="Cases per million (aggregate)",
                  "tdpm"="Deaths per million (aggregate)"
                  )
burdenSummary %>%
  pivot_longer(-c(countyFIPS, state, asofDate)) %>%
  ggplot(aes(x=pmax(0, value))) +
  geom_histogram(fill="lightblue", bins=50) +
  facet_wrap(~metricLabels[name], scales="free") +
  # Fixed typo: title previously read "Sep 3, 20201".
  labs(title="Coronavirus burden by county as of Sep 3, 2021",
       caption="Data from USA Facts",
       x="Burden",
       y="# Counties"
       )
# Create plot of cases (past 28 days)
# Floor negatives at 0, cap at 25,000 CPM, then divide by 1000 so the legend
# reads in thousands ("CPM 000s", max 25). Nebraska is excluded up front
# because its county reporting is unreliable (see exploration below).
burdenSummary %>%
select(fips=countyFIPS, cpm28) %>%
mutate(cpm28=pmin(pmax(cpm28, 0), 25000)/1000) %>%
usmap::plot_usmap(regions="counties", exclude="NE", data=., values="cpm28") +
scale_fill_continuous("CPM 000s\n(28 days)", low="grey", high="red") +
labs(title="Cases per million (past 28 days) by county as of Sep 3, 2021",
subtitle="Capped at 25k",
caption="Source: USA Facts\nNebraska excluded due to unreliable data"
)
# Create plot of deaths (past 28 days)
# Floor negatives at 0 and cap at 250 DPM. Unlike the cases map, dpm28 is NOT
# rescaled to thousands, so the legend label is plain "DPM" (the previous
# "DPM 000s" label overstated the units by 1000x). Nebraska and Florida are
# excluded due to unreliable death reporting (see exploration below).
burdenSummary %>%
  select(fips=countyFIPS, dpm28) %>%
  mutate(dpm28=pmin(pmax(dpm28, 0), 250)) %>%
  usmap::plot_usmap(regions="counties", exclude=c("NE", "FL"), data=., values="dpm28") +
  scale_fill_continuous("DPM\n(28 days)", low="grey", high="red") +
  labs(title="Deaths per million (past 28 days) by county as of Sep 3, 2021",
       subtitle="Capped at 250",
       caption="Source: USA Facts\nNebraska and Florida excluded due to unreliable data"
       )
Further exploration of Nebraska and Florida is merited:
# Attach county name and population data
# Left join keeps every burdenSummary row; getCountyData() supplies
# countyName and pop keyed by countyFIPS + state.
burdenPop <- burdenSummary %>%
left_join(getCountyData(), by=c("countyFIPS", "state"))
burdenPop
## # A tibble: 3,142 x 9
## countyFIPS state asofDate tdpm tcpm dpm28 cpm28 countyName pop
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 01001 AL 2021-09-03 2130. 159462. 89.5 21747. Autauga County 55869
## 2 01003 AL 2021-09-03 1756. 151361. 278. 31209. Baldwin County 223234
## 3 01005 AL 2021-09-03 2633. 127279. 81.0 22563. Barbour County 24686
## 4 01007 AL 2021-09-03 3304. 159864. 357. 27105. Bibb County 22394
## 5 01009 AL 2021-09-03 2525. 151541. 104. 21115. Blount County 57826
## 6 01011 AL 2021-09-03 4158. 138501. 0 13068. Bullock County 10101
## 7 01013 AL 2021-09-03 3908. 147367. 206. 20362. Butler County 19448
## 8 01015 AL 2021-09-03 3125. 161172. 176. 23221. Calhoun County 113605
## 9 01017 AL 2021-09-03 3819. 145546. 60.1 23516. Chambers County 33254
## 10 01019 AL 2021-09-03 1871. 96541. 38.2 18705. Cherokee County 26196
## # ... with 3,132 more rows
# Counties of at least 25k with non-positive cases and/or deaths in the past 28 days
# Long-form rows limited to the 28-day metrics with value <= 0; is25k flags
# counties large enough (pop >= 25k) that a zero is suspicious rather than
# plausible.
burdenZero <- burdenPop %>%
pivot_longer(c(tdpm, tcpm, dpm28, cpm28)) %>%
filter(name %in% c("dpm28", "cpm28"), value <= 0) %>%
mutate(is25k=ifelse(pop >= 25000, "Y", "N"))
# Summarize by state and metric, sorted by affected population, to surface the
# states (FL, NJ, NE) driving the missing-data problem.
burdenZero %>%
group_by(state, name) %>%
summarize(n=n(), pop=sum(pop), value=sum(value), n25k=sum(is25k=="Y"),.groups="drop") %>%
arrange(-pop)
## # A tibble: 53 x 6
## state name n pop value n25k
## <chr> <chr> <int> <dbl> <dbl> <int>
## 1 FL dpm28 67 21477737 0 53
## 2 NJ dpm28 21 8882190 0 21
## 3 NE cpm28 93 1934408 0 12
## 4 NE dpm28 93 1934408 0 12
## 5 CO dpm28 20 890924 -154. 4
## 6 VA dpm28 44 889505 0 10
## 7 MN dpm28 45 771080 0 9
## 8 IA dpm28 49 749669 0 5
## 9 CA dpm28 14 631858 0 6
## 10 WI dpm28 24 597268 0 10
## # ... with 43 more rows
# Create the baseline map
# pivot_wider() uses the default names_from=name / values_from=value, so after
# widening a county has a non-NA cpm28/dpm28 column only if that metric was
# non-positive. The no* flags therefore mean "this metric is missing/zero":
#   noCPM  - cases missing, deaths fine
#   noDPM  - deaths missing, cases fine
#   noBoth - both missing
# case_when is evaluated in order: small counties (<25k) are labeled first
# regardless of which metric is missing; "Logic Error" is an explicit
# should-never-happen fallback.
p1 <- burdenZero %>%
pivot_wider() %>%
mutate(noCPM=(!is.na(cpm28) & is.na(dpm28)),
noDPM=(!is.na(dpm28) & is.na(cpm28)),
noBoth=(!is.na(cpm28) & !is.na(dpm28)),
ctyType=factor(case_when(is25k=="N" ~ "<25k",
noBoth ~ "Both",
noDPM ~ "Death",
noCPM ~ "Cases",
TRUE ~ "Logic Error"
)
)
) %>%
select(fips=countyFIPS, ctyType) %>%
usmap::plot_usmap(regions="counties", values="ctyType", data=.)
p1
# Map with state overlays and legend
# Rebuilds the county map from p1's underlying data (p1[[1]], the data frame
# usmap::plot_usmap stored in the ggplot object), then overlays state borders
# from a separate plot_usmap("states") call.
p2 <- ggplot() +
  geom_polygon(data=p1[[1]],
               # Reference the column through the layer's data argument rather
               # than reaching into p1[[1]]$ctyType inside aes() — same fill
               # values, but robust to ggplot's row reordering/faceting.
               aes(x=x, y=y, group=group, fill=ctyType),
               color = NA,
               size = 0.1
               ) +
  geom_polygon(data=usmap::plot_usmap("states")[[1]],
               aes(x=x, y=y, group=group),
               color = "black",
               lwd=1.25,
               # NOTE(review): alpha(0.001) passes 0.001 as the colour argument
               # of scales::alpha() — it renders as a near-transparent fill
               # here, but alpha("white", 0.001) would state the intent; verify.
               fill = alpha(0.001)
               ) +
  coord_equal() +
  ggthemes::theme_map() +
  labs(title="Counties reporting non-positive cases and/or deaths in the past 28 days",
       subtitle="County-level data as of Sep 3, 2021",
       caption="Source: USA Facts"
       ) +
  scale_fill_brewer("Missing\nMetric", na.value="white", palette="Set2")
p2
Missing data are common in Florida (deaths), New Jersey (deaths), and Nebraska (cases and deaths). Smaller counties may not have any cases or (especially) deaths in a month, and all other states have at least some counties reporting burdens in the past 28 days.